about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_conv
diff options
context:
space:
mode:
author: Michael Tyler <michael.tyler@arm.com> 2023-04-12 17:43:17 +0100
committer: Michael Tyler <michael.tyler@arm.com> 2023-06-05 15:57:58 +0000
commit: 74921eee924625426429044decefe3673561b174 (patch)
tree: 654da1a95e3d42d6af8ad1ff27bb40d77b1fd8c5 /src/core/NEON/kernels/arm_conv
parent: df5d9878008be9b60586df97ebfff197abb5195e (diff)
download: ComputeLibrary-74921eee924625426429044decefe3673561b174.tar.gz
Update CPU kernel implementations and guard directives

Resolves: COMPMID-6023
Change-Id: I868975d14c4f98af6716726feda22405a6a4c891
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_conv')
-rw-r--r--src/core/NEON/kernels/arm_conv/addressing.cpp5
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp90
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp178
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp178
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp40
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp5
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp5
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp273
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp350
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp567
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1290
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp969
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1848
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp293
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp462
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp661
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp808
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp356
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1510
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp279
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp573
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1072
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp972
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp299
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp466
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp10
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp661
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp810
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp216
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp542
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp1281
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1196
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2738
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1798
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3194
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp11
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp532
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2120
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp11
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2496
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2738
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1798
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3194
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp532
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2120
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1856
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp13
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1786
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp13
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3632
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp13
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1798
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp12
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3194
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp7
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2120
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp8
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp192
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp252
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp8
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp466
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp648
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp8
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp710
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1000
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp8
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp184
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp286
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp394
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp1132
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp1208
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp582
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp698
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp1356
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp1288
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp582
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp830
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp1570
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp1434
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp582
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp830
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1570
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1434
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp582
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp830
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1570
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1434
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp252
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp514
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp734
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp884
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1188
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp270
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp330
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp642
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp838
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp252
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp514
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp734
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp884
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1188
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp270
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp330
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp642
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp838
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp28
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp304
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp554
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp8
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp744
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp880
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1028
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp338
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp426
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp756
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp880
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1028
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp338
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp424
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp590
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1028
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp275
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp53
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp274
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp239
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp56
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp239
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp283
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp56
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp359
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp283
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp488
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp283
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp56
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp359
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp303
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp515
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp38
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp155
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp54
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp153
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp38
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp153
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp54
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp151
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp157
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp54
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp151
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp161
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp221
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp165
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp54
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp151
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp177
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp231
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp136
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp146
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp156
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp382
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp4
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp312
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp256
316 files changed, 61766 insertions, 62533 deletions
diff --git a/src/core/NEON/kernels/arm_conv/addressing.cpp b/src/core/NEON/kernels/arm_conv/addressing.cpp
index d01627bc5a..2460398880 100644
--- a/src/core/NEON/kernels/arm_conv/addressing.cpp
+++ b/src/core/NEON/kernels/arm_conv/addressing.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,8 @@
*/
#include "addressing.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
+#include <algorithm>
#include <cstring>
namespace arm_conv {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index c305835107..b6f45c6825 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -24,8 +24,8 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
index c2b861000c..2950d5e957 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -10,8 +10,8 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "depthwise_common.hpp"
+#include "utils.hpp"
+
using arm_gemm::iceildiv;
namespace arm_conv {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
index cef568fadd..3d305b6d18 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -27,10 +27,6 @@
#include "depthwise_depthfirst.hpp"
#include "interleaves/generic_quantized_dot_product.hpp"
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
#include <limits>
namespace arm_conv {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
index 350e93b874..134dbd1b4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,18 @@ namespace
);
}
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
index 09ee983907..382ccd3c62 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,6 +103,18 @@ namespace
);
}
+ template <class Strategy>
+ unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ ) * 2 / 3;
+ }
+
#if defined(__aarch64__)
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
index 1ba7694f1e..15064aeedc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
#pragma once
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "depthwise.hpp"
namespace arm_conv
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index 2b2e6f3555..567eab13f3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#pragma once
+
#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"
@@ -52,7 +54,7 @@ struct PlanarKernelType;
template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
{
- using Type = std::function<void(
+ typedef void (*Type)(
const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
unsigned int pad_top, unsigned int valid_input_rows,
unsigned int pad_left, unsigned int valid_input_cols,
@@ -60,7 +62,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
TOutput **, const size_t *, const size_t *, unsigned int output_cols,
unsigned int start_channels, unsigned int valid_channels,
TAccum act_min, TAccum act_max
- )>;
+ );
template <typename WorkspaceType>
static inline void execute(
@@ -89,7 +91,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
template <typename TInput, typename TWeight, typename TOutput>
struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
- using Type = std::function<void(
+ typedef void (*Type)(
const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
unsigned int pad_top, unsigned int valid_input_rows,
unsigned int pad_left, unsigned int valid_input_cols,
@@ -97,7 +99,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize3
TOutput **, const size_t *, const size_t *, unsigned int output_cols,
unsigned int start_channel, unsigned int valid_channels,
const arm_gemm::Requantize32 &
- )>;
+ );
template <typename WorkspaceType>
static inline void execute(
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
index 99b91fb833..39f60c362b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "interleaves/generic.hpp"
#include "depthfirst_driver.hpp"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
deleted file mode 100644
index d59d6b7e35..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "8b_mla.hpp"
-
-size_t generic_get_packed_size(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_input_channels
-)
-{
- const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
- return arm_gemm::roundup((long unsigned int) n_input_channels, per_iter) * kernel_rows * kernel_cols * sizeof(int8_t);
-}
-
-void generic_pack(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_channels,
- void *_outptr,
- const void *_weights,
- size_t ld_weight_col,
- size_t ld_weight_row
-)
-{
- int8_t *outptr = reinterpret_cast<int8_t *>(_outptr);
- const int8_t *weights = reinterpret_cast<const int8_t *>(_weights);
-
- // Get the strides
- ld_weight_col = (ld_weight_col == 0) ? n_channels * sizeof(int8_t) : ld_weight_col;
- ld_weight_row = (ld_weight_row == 0) ? kernel_cols * ld_weight_col : ld_weight_row;
-
- // Pack into per-iter chunks.
- const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
- for (unsigned int c = 0; c < n_channels; c += per_iter)
- {
- auto weight_row = weights + c;
- const auto to_copy = std::min<unsigned int>(per_iter, n_channels - c);
-
- for (unsigned int i = 0; i < kernel_rows; i++)
- {
- auto weight_col = weight_row;
-
- for (unsigned int j = 0; j < kernel_cols; j++)
- {
- memcpy(outptr, weight_col, to_copy);
- outptr += per_iter;
- weight_col += ld_weight_col;
- }
-
- weight_row += ld_weight_row;
- }
- }
-}
-
-namespace arm_conv {
-namespace depthwise {
-
-ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3)
-ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 5, 5)
-ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 3, 3)
-ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 5, 5)
-
-} // namespace depthwise
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp
deleted file mode 100644
index 3176d1dedd..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
-#include <cstdint>
-#include <cstring>
-
-using namespace arm_gemm;
-
-size_t generic_get_packed_size(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_input_channels
-);
-
-void generic_pack(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_channels,
- void *_outptr,
- const void *_weights,
- size_t ld_weight_col,
- size_t ld_weight_row
-);
-
-#define ADD_IMPLEMENTATION(ARCH, TYPENAME, TYPE, VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS) \
-struct interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla \
-{ \
- static size_t get_packed_size(const DepthwiseArgs &args); \
- static void pack_parameters( \
- unsigned int n_channels, void *outptr, \
- const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row \
- ); \
-}; \
-\
-size_t interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::get_packed_size(const DepthwiseArgs &args) \
-{ \
- return generic_get_packed_size(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, args.input_channels); \
-} \
-\
-void interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::pack_parameters(unsigned int n_channels, void *outptr, \
- const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row) \
-{ \
- generic_pack(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, n_channels, outptr, weights, ld_weight_col, ld_weight_row); \
-}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
index adda78f164..5e4bf99120 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(__aarch64__)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -54,162 +54,162 @@ void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cmp %x[ld_weight_col], XZR\n"
"csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"movi v16.4s, #0x9\n"
- "movi v0.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"mov x21, #0x3\n"
"mul x21, %x[ld_weight_col], x21\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
- "ld1r { v31.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_weights_offset]\n"
"ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v30.4s, v30.4s, v31.4s\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
"csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v29.16b, #0x1\n"
- "mul v30.4s, v30.4s, v16.4s\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
"add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v28.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
"ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x24, x25, %x[ld_weight_row]\n"
"add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
"mov x22, #0x0\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q26, [%x[bias], x22]\n"
+ "ldr q25, [%x[bias], x22]\n"
"2:" // Loop: Skip bias load
- "ldr s25, [%x[weights], #0x0]\n"
- "ldr s22, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- "ldr s20, [%x[weights], x23]\n"
- "ldr s23, [x25, #0x0]\n"
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "ldr s21, [x25, %x[ld_weight_col]]\n"
- "ldr s18, [x25, x23]\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
"ldr s17, [x24, #0x0]\n"
"ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x4e9697b8 // sdot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
+ ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
"ldr s16, [x24, x23]\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
- ".inst 0x4e9297b8 // sdot v24.4s, v29.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9097b8 // sdot v24.4s, v29.16b, v16.16b\n"
+ ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
"add %x[weights], %x[weights], #0x4\n"
"add x25, x25, #0x4\n"
- "mls v26.4s, v24.4s, v31.4s\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
"add x24, x24, #0x4\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q28, [%x[rq_mul_perchannel], x22]\n"
- "ldr q27, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x0]\n"
"add x22, x22, #0x10\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 7f\n"
"add %x[bias], %x[bias], x22\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v25.h }[0], [%x[weights]]\n"
- "ld1 { v23.h }[0], [x25]\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.h }[0], [x21]\n"
- "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.h }[0], [x21]\n"
+ "ld1 { v19.h }[0], [x21]\n"
"ld1 { v18.h }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.h }[0], [x24]\n"
- "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
"add x25, x25, #0x2\n"
- "ld1 { v16.h }[0], [x20]\n"
+ "ld1 { v21.h }[0], [x20]\n"
"add x24, x24, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.b }[2], [%x[weights]]\n"
- "ld1 { v23.b }[2], [x25]\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[2], [x21]\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v19.b }[2], [x21]\n"
"ld1 { v18.b }[2], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[2], [x24]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v25.b }[0], [%x[weights]]\n"
- "ld1 { v23.b }[0], [x25]\n"
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[0], [x21]\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[0], [x21]\n"
+ "ld1 { v19.b }[0], [x21]\n"
"ld1 { v18.b }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[0], [x24]\n"
- "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x4e9697b8 // sdot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9297b8 // sdot v24.4s, v29.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9097b8 // sdot v24.4s, v29.16b, v16.16b\n"
- "mls v26.4s, v24.4s, v31.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
@@ -217,24 +217,24 @@ void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"add x21, %x[rq_mul_perchannel], x22\n"
"add x20, %x[rq_shift_perchannel], x22\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v28.d }[0], [x21], #0x8\n"
- "ld1 { v27.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v27.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q28, [%x[outptr], #0x0]\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
index b89886ae0c..314f09a0c5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(__aarch64__)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -54,162 +54,162 @@ void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cmp %x[ld_weight_col], XZR\n"
"csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"movi v16.4s, #0x9\n"
- "movi v0.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"mov x21, #0x3\n"
"mul x21, %x[ld_weight_col], x21\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
- "ld1r { v31.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_weights_offset]\n"
"ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v30.4s, v30.4s, v31.4s\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
"csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v29.16b, #0x1\n"
- "mul v30.4s, v30.4s, v16.4s\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
"add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v28.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
"ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x24, x25, %x[ld_weight_row]\n"
"add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
"mov x22, #0x0\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q26, [%x[bias], x22]\n"
+ "ldr q25, [%x[bias], x22]\n"
"2:" // Loop: Skip bias load
- "ldr s25, [%x[weights], #0x0]\n"
- "ldr s22, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- "ldr s20, [%x[weights], x23]\n"
- "ldr s23, [x25, #0x0]\n"
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "ldr s21, [x25, %x[ld_weight_col]]\n"
- "ldr s18, [x25, x23]\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
"ldr s17, [x24, #0x0]\n"
"ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x6e9697b8 // udot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
+ ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
"ldr s16, [x24, x23]\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
- ".inst 0x6e9297b8 // udot v24.4s, v29.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9097b8 // udot v24.4s, v29.16b, v16.16b\n"
+ ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
"add %x[weights], %x[weights], #0x4\n"
"add x25, x25, #0x4\n"
- "mls v26.4s, v24.4s, v31.4s\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
"add x24, x24, #0x4\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q28, [%x[rq_mul_perchannel], x22]\n"
- "ldr q27, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x0]\n"
"add x22, x22, #0x10\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 7f\n"
"add %x[bias], %x[bias], x22\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v25.h }[0], [%x[weights]]\n"
- "ld1 { v23.h }[0], [x25]\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.h }[0], [x21]\n"
- "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.h }[0], [x21]\n"
+ "ld1 { v19.h }[0], [x21]\n"
"ld1 { v18.h }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.h }[0], [x24]\n"
- "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
"add x25, x25, #0x2\n"
- "ld1 { v16.h }[0], [x20]\n"
+ "ld1 { v21.h }[0], [x20]\n"
"add x24, x24, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.b }[2], [%x[weights]]\n"
- "ld1 { v23.b }[2], [x25]\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[2], [x21]\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v19.b }[2], [x21]\n"
"ld1 { v18.b }[2], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[2], [x24]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v25.b }[0], [%x[weights]]\n"
- "ld1 { v23.b }[0], [x25]\n"
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[0], [x21]\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[0], [x21]\n"
+ "ld1 { v19.b }[0], [x21]\n"
"ld1 { v18.b }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[0], [x24]\n"
- "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x6e9697b8 // udot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9297b8 // udot v24.4s, v29.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9097b8 // udot v24.4s, v29.16b, v16.16b\n"
- "mls v26.4s, v24.4s, v31.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
@@ -217,24 +217,24 @@ void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"add x21, %x[rq_mul_perchannel], x22\n"
"add x20, %x[rq_shift_perchannel], x22\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v28.d }[0], [x21], #0x8\n"
- "ld1 { v27.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v27.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q28, [%x[outptr], #0x0]\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
index 5b5ae17806..756c50b98c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "depthwise.hpp"
#include <functional>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp
deleted file mode 100644
index de74ca5f43..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "8b_mla.hpp"
-
-namespace arm_conv {
-namespace depthwise {
-
-#if defined(__ARM_FEATURE_SVE)
-
-ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 3, 3)
-ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 5, 5)
-ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 3, 3)
-ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 5, 5)
-
-#endif // defined(__ARM_FEATURE_SVE)
-
-} // namespace depthwise
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
index 0cf8044733..3a4999296a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -76,7 +76,6 @@ void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
-
"2:" // Loop
"cntp x20, p2, p1.s\n"
"whilelt p0.b, XZR, x20\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
index e5bc8198f8..7c5d3c4904 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -76,7 +76,6 @@ void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
-
"2:" // Loop
"cntp x20, p2, p1.s\n"
"whilelt p0.b, XZR, x20\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index d2db12535f..6beaba841f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index f4027df375..d8ca3d7437 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -116,9 +116,9 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"add x28, x9, x25, LSL #1\n"
"add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x27, x28, x25, LSL #1\n"
"add x26, x11, x15\n"
"add x25, x12, x24, LSL #1\n"
@@ -126,7 +126,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x10, #0x20]\n"
@@ -145,162 +145,162 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
"add x23, x23, #0x10\n"
"cmp x23, x22, LSL #4\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
- "ldr q16, [x10, #0x0]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"add x13, x13, #0x10\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"ldr q4, [x10, #0x50]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x28]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
"ldr q1, [x10, #0x20]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
"ldr q0, [x10, #0x10]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
"ldr q2, [x10, #0x30]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
"ldr q13, [x28, x15]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x27, x15]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
"ldr q3, [x10, #0x40]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x11]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
"ldr q5, [x10, #0x60]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"ldr q11, [x13, x26]\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
"ldr q9, [x9, x15]\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
"ld1 { v10.8h }, [x13]\n"
"ldr q6, [x10, #0x70]\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
"ldr q12, [x9, x11]\n"
"ldr q7, [x10, #0x80]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
"ldr q8, [x10, #0x90]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
"add x27, x27, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x12]\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
"add x10, x10, #0xa0\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x14]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.8h }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"add x13, x13, #0x10\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x28]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x27, x15]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x11]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"add x27, x27, #0x10\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x12]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x14]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.8h }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 57f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"add x24, x9, x15\n"
"add x23, x13, XZR\n"
@@ -363,11 +363,11 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h12, [x21, #0x0]\n"
"ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
"add x20, x27, XZR\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v12.8h\n"
@@ -630,14 +630,14 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
"tbz %x[n_channels], #2, 54f\n"
"mov x21, x12\n"
"mov x20, x25\n"
@@ -699,7 +699,6 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"56:" // Tile loop: Oddments: Store: Bit 2: End
-
"57:" // Tile loop: End
"ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -714,7 +713,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index bea4715313..c9a554e9ad 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -83,16 +83,16 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x15, %x[n_channels], #0x3\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
"sub x27, XZR, x16\n"
"cbz x15, 3f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x16, x15, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -104,197 +104,197 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "ldr q9, [x26, x28]\n"
- "ldr q10, [x22, x28]\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr q11, [x25, x28]\n"
- "ldr q12, [x24, x28]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x28]\n"
- "bge 2f\n"
- "1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr q16, [x14, #0x0]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
"ldr q12, [x20, x28]\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "ldr x24, [x13, #0x58]\n"
- "ldr x23, [x13, #0x60]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x28]\n"
- "ldr x22, [x13, #0x68]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x28]\n"
- "ldr x21, [x13, #0x70]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"ldr q4, [x14, #0x50]\n"
- "ldr x20, [x13, #0x78]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x23, x28]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x21, x28]\n"
"ldr q1, [x14, #0x20]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x22, x28]\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x16]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x28]\n"
"ldr q3, [x14, #0x40]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
- "ldr q11, [x25, x16]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x22, x16]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "ldr q9, [x26, x16]\n"
- "ldr q10, [x22, x16]\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "ldr q12, [x24, x16]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x21, x16]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
"add x16, x16, #0x10\n"
"add x27, x27, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
"cmp x16, x15, LSL #4\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
"add x28, x28, #0x10\n"
- "str q28, [x12, x27]\n"
+ "str q24, [x12, x27]\n"
"add x14, x14, #0xa0\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr x24, [x13, #0x58]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
"ldr x23, [x13, #0x60]\n"
"ldr x22, [x13, #0x68]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x28]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
"ldr x21, [x13, #0x70]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x28]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0x78]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"add x27, x27, #0x10\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x23, x28]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"add x28, x28, #0x10\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x12, x27]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 56f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "mov x27, x28\n"
- "add x12, x12, x27\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x9, x9, x27\n"
+ "add x9, x9, x20\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -357,12 +357,12 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v12.h }[0], [x21], #0x2\n"
"ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
"ldr x20, [x13, #0x28]\n"
"add x20, x20, x28\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v12.8h\n"
@@ -635,14 +635,14 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"51:" // Oddments: Load input (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
"tbz %x[n_channels], #2, 53f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -687,7 +687,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"56:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 75368dfcf9..6bbd3508cb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 2b1dc3646d..4e64a2bf2b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"add x9, x11, x8\n"
"add x28, x15, x22, LSL #1\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v14.8h }, [x20]\n"
"add x27, x10, x25, LSL #1\n"
"add x26, x9, x8\n"
"add x25, x28, x22, LSL #1\n"
@@ -130,7 +130,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x24\n"
"cbz x23, 4f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x24, x23, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -149,304 +149,304 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"add x24, x24, #0x10\n"
"cmp x24, x23, LSL #4\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "ldr q16, [x14, #0x0]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x10]\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
"add x13, x13, #0x10\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
"add x10, x10, #0x10\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
"ld1 { v10.8h }, [x16]\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x12]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
"add x12, x12, #0x10\n"
"ldr q9, [x12, x11]\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x27, x11]\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
"ldr q11, [x16, x26]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
"ldr q13, [x13, x11]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
"add x27, x27, #0x10\n"
"ld1 { v12.8h }, [x27]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
"add x14, x14, #0xa0\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "st1 { v23.8h }, [x15]\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x15, x17]\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x15, x22]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
"st1 { v26.8h }, [x28]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.8h }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x10]\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
"add x13, x13, #0x10\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
"add x10, x10, #0x10\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x12]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
"add x12, x12, #0x10\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x27, x11]\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
"add x27, x27, #0x10\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "st1 { v23.8h }, [x15]\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x15, x17]\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x15, x22]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
"st1 { v26.8h }, [x28]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.8h }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 93f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"add x24, x12, x11\n"
"add x23, x16, XZR\n"
@@ -509,18 +509,18 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr h12, [x21, #0x0]\n"
"ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"add x20, x27, x26\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v6.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
@@ -1009,25 +1009,25 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 90f\n"
"mov x22, x15\n"
"mov x21, x28\n"
@@ -1134,7 +1134,6 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"st1 { v28.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"92:" // Tile loop: Oddments: Store: Bit 2: End
-
"93:" // Tile loop: End
"ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -1149,7 +1148,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 878aa29bcf..72e68482c6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -87,405 +87,405 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x3\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr q13, [x27, x13]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "ldr q16, [x15, #0x0]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0x68]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "ldr x25, [x14, #0x78]\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x23, x14]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x23, x12]\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q5, [x15, #0x60]\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "ldr q8, [x15, #0x90]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "ldr q7, [x15, #0x80]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "ldr x23, [x16, #0x20]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x8]\n"
- "ldr q10, [x10, x8]\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x8]\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "ldr q12, [x28, x8]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "str q24, [x22, x12]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "str q25, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "str q26, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x8, x8, #0x10\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "cmp x8, x17, LSL #4\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "add x13, x13, #0x10\n"
- "str q28, [x22, x12]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x21, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0x68]\n"
- "ldr x25, [x14, #0x78]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x23, x12]\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr x23, [x16, #0x20]\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "str q24, [x22, x12]\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "ldr x22, [x16, #0x28]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "str q25, [x21, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "str q26, [x20, x12]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "add x13, x13, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x22, x12]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x21, x12]\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 92f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x24, [x14, #0x0]\n"
- "ldr x23, [x14, #0x8]\n"
- "add x24, x24, x13\n"
- "add x23, x23, x13\n"
- "ldr x22, [x14, #0x10]\n"
- "ldr x21, [x14, #0x18]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -534,19 +534,19 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ld1 { v12.h }[0], [x21], #0x2\n"
"ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "ldr x20, [x14, #0x28]\n"
- "add x20, x20, x13\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v6.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
@@ -574,9 +574,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (4, 4): Bit 2: End
- "ldr x20, [x14, #0x30]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -597,10 +597,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
@@ -625,10 +625,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x15, #0x40]\n"
"fmla v23.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -649,10 +649,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -673,10 +673,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
@@ -701,10 +701,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v23.8h, v3.8h, v11.8h\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -725,10 +725,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -749,10 +749,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -773,10 +773,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
@@ -801,10 +801,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -825,10 +825,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v29.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -849,10 +849,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 57f\n"
@@ -875,10 +875,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 61f\n"
@@ -901,10 +901,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 65f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -925,10 +925,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 69f\n"
@@ -951,10 +951,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v25.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -976,10 +976,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 77f\n"
@@ -1002,10 +1002,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 81f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1027,10 +1027,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -1054,236 +1054,234 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"87:" // Oddments: Load input (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 89f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.d }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.d }[0], [x22]\n"
- "st1 { v25.d }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x8\n"
- "st1 { v28.d }[0], [x22]\n"
- "st1 { v29.d }[0], [x21]\n"
- "st1 { v30.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 88f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x4\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[6], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[6], [x22]\n"
- "st1 { v25.h }[6], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[6], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[6], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[6], [x22]\n"
- "st1 { v29.h }[6], [x21]\n"
- "st1 { v30.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[4], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[4], [x22]\n"
- "st1 { v25.h }[4], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[4], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[4], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[4], [x22]\n"
- "st1 { v29.h }[4], [x21]\n"
- "st1 { v30.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[0], [x22]\n"
- "st1 { v25.s }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x4\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[2], [x22]\n"
- "st1 { v25.h }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[2], [x22]\n"
- "st1 { v29.h }[2], [x21]\n"
- "st1 { v30.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[0], [x22]\n"
- "st1 { v25.h }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[0], [x22]\n"
- "st1 { v29.h }[0], [x21]\n"
- "st1 { v30.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"91:" // Oddments: Store: Bit 2: End
-
"92:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 4f0de6b61c..04fb532937 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index a3a372be05..a1e1dd0e99 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -124,9 +124,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x22, LSL #1\n"
"add x23, x5, x5\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x26, x9, x24, LSL #1\n"
"add x25, x28, x4\n"
"add x24, x27, x22, LSL #1\n"
@@ -134,7 +134,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x6\n"
"cbz x13, 4f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x6, x13, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -152,499 +152,499 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"add x6, x6, #0x10\n"
"cmp x6, x13, LSL #4\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
"ldr q9, [x12, x17]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v30.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.8h, v4.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
"ldr q10, [x12, x11]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.8h, v8.8h, v9.8h\n"
+ "fmla v20.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v2.8h, v9.8h\n"
"ld1 { v9.8h }, [x15]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v18.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
"ldr q10, [x15, x17]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
"ldr q10, [x14, x4]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v25.8h, v6.8h, v10.8h\n"
"ldr q10, [x7, x17]\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x12, x4]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
"ldr q12, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v26.8h, v6.8h, v9.8h\n"
+ "fmla v20.8h, v4.8h, v9.8h\n"
+ "fmla v22.8h, v3.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
"ld1 { v10.8h }, [x14]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v19.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
"ldr q11, [x9, x17]\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
"ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
"ld1 { v10.8h }, [x12]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v18.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v12.8h\n"
"ldr q12, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
"ldr q10, [x26, x17]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v18.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
"ldr q11, [x9, x11]\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
"ldr q12, [x26, x11]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
"ldr q10, [x15, x4]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
"add x26, x26, #0x10\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v6.8h, v11.8h\n"
"ldr q11, [x15, x28]\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
"add x15, x15, #0x10\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v30.8h, v6.8h, v12.8h\n"
"ldr q12, [x9, x4]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v25.8h, v3.8h, v10.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
"ldr q10, [x9, x28]\n"
- "ldr q9, [x14, x17]\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
"add x9, x9, #0x10\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "ldr q13, [x16, #0x0]\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
"ldr q11, [x7, x25]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v3.8h, v12.8h\n"
"ldr q12, [x14, x11]\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q6, [x16, #0x70]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v8.8h, v10.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
"ld1 { v10.8h }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x8]\n"
- "ldr q7, [x16, #0x80]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x8, x5]\n"
- "ldr q8, [x16, #0x90]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x8, x23]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
"add x16, x16, #0xa0\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x8, x22]\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "st1 { v28.8h }, [x8]\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x10]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x10, x5]\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x10, x23]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x10, x22]\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "str q24, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.8h }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v20.8h }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.8h }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v21.8h }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x17]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x12, x11]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v21.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.8h, v7.8h, v24.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x15]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x15, x17]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "ldr q10, [x14, x4]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "ldr q10, [x7, x17]\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x12, x4]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr q12, [x7, x11]\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
+ "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v30.8h, v4.8h, v24.8h\n"
+ "fmla v18.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "fmla v14.8h, v0.8h, v24.8h\n"
+ "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v17.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v1.8h, v21.8h\n"
+ "ld1 { v20.8h }, [x9]\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v11.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v18.8h, v4.8h, v9.8h\n"
+ "fmla v10.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v14.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v20.8h\n"
+ "fmla v26.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.8h, v1.8h, v21.8h\n"
+ "fmla v23.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.8h, v4.8h, v21.8h\n"
+ "fmla v19.8h, v3.8h, v21.8h\n"
+ "fmla v31.8h, v0.8h, v21.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v5.8h, v20.8h\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.8h, v2.8h, v21.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v23.8h, v5.8h, v21.8h\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v11.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.8h, v7.8h, v20.8h\n"
+ "fmla v12.8h, v6.8h, v20.8h\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.8h, v4.8h, v21.8h\n"
+ "fmla v16.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v1.8h, v21.8h\n"
+ "fmla v30.8h, v0.8h, v21.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v31.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v22.8h\n"
+ "ldr q22, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x14]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x17]\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "ldr q12, [x14, x25]\n"
+ "fmla v28.8h, v7.8h, v20.8h\n"
+ "fmla v16.8h, v6.8h, v20.8h\n"
+ "fmla v27.8h, v4.8h, v20.8h\n"
+ "fmla v30.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v1.8h, v20.8h\n"
+ "fmla v12.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.8h, v2.8h, v21.8h\n"
+ "fmla v17.8h, v1.8h, v21.8h\n"
+ "fmla v19.8h, v0.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x14]\n"
+ "fmla v14.8h, v2.8h, v20.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v0.8h, v21.8h\n"
+ "fmla v31.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v4.8h, v20.8h\n"
+ "fmla v25.8h, v1.8h, v20.8h\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v19.8h, v1.8h, v22.8h\n"
+ "ldr q20, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "ld1 { v10.8h }, [x12]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr q12, [x12, x25]\n"
+ "fmla v23.8h, v6.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x12]\n"
+ "fmla v12.8h, v4.8h, v24.8h\n"
+ "fmla v14.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v2.8h, v20.8h\n"
+ "ldr q20, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x26, x17]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q11, [x9, x11]\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x26, x11]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x15, x4]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v21.8h\n"
+ "fmla v27.8h, v3.8h, v21.8h\n"
+ "fmla v26.8h, v0.8h, v21.8h\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v14.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v30.8h, v7.8h, v24.8h\n"
+ "fmla v18.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v5.8h, v24.8h\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.8h, v5.8h, v20.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v14.8h, v4.8h, v21.8h\n"
+ "fmla v25.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.8h, v8.8h, v22.8h\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v18.8h, v7.8h, v21.8h\n"
"add x26, x26, #0x10\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "ldr q11, [x15, x28]\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v10.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
"add x15, x15, #0x10\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x4]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x28]\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmla v14.8h, v7.8h, v20.8h\n"
+ "fmla v25.8h, v6.8h, v20.8h\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.8h, v4.8h, v9.8h\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v9.8h\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v19.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
"add x9, x9, #0x10\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x8]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmla v31.8h, v2.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmla v27.8h, v7.8h, v24.8h\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v26.8h, v4.8h, v24.8h\n"
+ "fmla v12.8h, v3.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v8.8h, v0.8h\n"
+ "fmla v10.8h, v7.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmla v14.8h, v5.8h, v0.8h\n"
+ "fmla v25.8h, v4.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "fmax v14.8h, v14.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "st1 { v23.8h }, [x8]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
"str q17, [x8, x5]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x8, x23]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x8, x22]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x10]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x10, x5]\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x10, x23]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x10, x22]\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "st1 { v28.8h }, [x10]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v12.8h, v12.8h, v15.8h\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.8h, v14.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q11, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.8h }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.8h }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v26.8h }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 141f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"add x23, x14, x17\n"
"add x22, x7, XZR\n"
@@ -699,27 +699,27 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr h11, [x21, #0x0]\n"
"ldr h12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
"add x20, x26, XZR\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -740,7 +740,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
"ldr h10, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
"add x20, x26, x25\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d11, [x20], #0x8\n"
@@ -762,7 +762,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
"add x20, x12, x17\n"
"tbz %x[n_channels], #2, 18f\n"
"ldr d9, [x20], #0x8\n"
@@ -792,8 +792,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -1513,40 +1513,40 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 138f\n"
"mov x23, x8\n"
"mov x22, x10\n"
@@ -1712,7 +1712,6 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"140:" // Tile loop: Oddments: Store: Bit 2: End
-
"141:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 16326150fd..96feeeeece 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -98,629 +98,629 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x3\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v14.8h }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "add x12, x12, #0x10\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "fmla v16.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.8h, v7.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v3.8h, v9.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.8h, v8.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v6.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v22.8h\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v18.8h, v1.8h, v22.8h\n"
+ "fmla v24.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.8h, v4.8h, v22.8h\n"
+ "fmla v15.8h, v3.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v27.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
+ "fmla v10.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v21.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x13]\n"
- "ldr q9, [x11, x8]\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "ldr q13, [x15, #0x0]\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v16.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.8h, v8.8h, v9.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
"fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q7, [x15, #0x80]\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v15.8h, v0.8h, v11.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v9.8h\n"
+ "fmla v15.8h, v1.8h, v9.8h\n"
+ "fmla v10.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v10.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v3.8h, v22.8h\n"
+ "fmla v15.8h, v5.8h, v12.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v10.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v23.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.8h, v8.8h, v22.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.8h, v7.8h, v22.8h\n"
+ "ldr q7, [x17, #0x80]\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x23, x12]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x22, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "str q18, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "str q19, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q10, [x10, x8]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x23, x12]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v26.8h, v3.8h, v11.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.8h, v4.8h, v22.8h\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
- "ldr q11, [x9, x8]\n"
- "ldr q12, [x28, x8]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "str q24, [x23, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "str q25, [x22, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "str q26, [x21, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x8, x8, #0x10\n"
- "cmp x8, x17, LSL #4\n"
- "str q27, [x20, x12]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "add x13, x13, #0x10\n"
- "str q28, [x23, x12]\n"
- "str q29, [x22, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "add x12, x12, #0x10\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x13]\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.8h, v6.8h, v24.8h\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v2.8h, v24.8h\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.8h, v7.8h, v22.8h\n"
+ "fmla v9.8h, v6.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v12.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v23.8h\n"
+ "fmla v15.8h, v1.8h, v23.8h\n"
+ "fmla v9.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.8h, v7.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v15.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v29.8h, v3.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.8h, v2.8h, v23.8h\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v23.8h\n"
+ "fmla v9.8h, v5.8h, v23.8h\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v5.8h, v23.8h\n"
+ "fmla v12.8h, v2.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.8h, v8.8h, v22.8h\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v12.8h, v3.8h, v16.8h\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.8h, v4.8h, v30.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v30.8h\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v24.8h\n"
+ "fmla v26.8h, v8.8h, v16.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmla v25.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v18.8h, v1.8h, v30.8h\n"
+ "fmla v31.8h, v0.8h, v30.8h\n"
+ "ldr q16, [x20, x15]\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x23, x12]\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "str q17, [x22, x12]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q18, [x21, x12]\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "str q19, [x20, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v15.8h, v2.8h, v24.8h\n"
+ "fmla v9.8h, v1.8h, v24.8h\n"
"fmin v20.8h, v20.8h, v14.8h\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x23, x12]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "str q24, [x23, x12]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q25, [x22, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "str q26, [x21, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "str q27, [x20, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q28, [x23, x12]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmax v9.8h, v9.8h, v13.8h\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.8h, v4.8h, v23.8h\n"
+ "fmla v26.8h, v3.8h, v23.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v4.8h, v16.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q29, [x22, x12]\n"
- "add x13, x13, #0x10\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v9.8h, v9.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v12.8h, v12.8h, v14.8h\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 140f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x23, [x14, #0x0]\n"
- "ldr x22, [x14, #0x8]\n"
- "add x23, x23, x13\n"
- "add x22, x22, x13\n"
- "ldr x21, [x14, #0x10]\n"
- "ldr x20, [x14, #0x18]\n"
- "add x21, x21, x13\n"
- "add x20, x20, x13\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x23], #0x8\n"
"ld1 { v10.d }[0], [x22], #0x8\n"
@@ -762,28 +762,28 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ld1 { v11.h }[0], [x21], #0x2\n"
"ld1 { v12.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 9f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
@@ -804,9 +804,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (5, 0): Bit 2: End
- "ldr x20, [x14, #0x28]\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -827,9 +827,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (5, 5): Bit 2: End
- "ldr x20, [x14, #0x30]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -850,17 +850,17 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v21.8h, v7.8h, v9.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -881,10 +881,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -905,10 +905,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (0, 4): Bit 2: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x16, #0x48]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -929,10 +929,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
@@ -960,10 +960,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -984,10 +984,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (1, 5): Bit 2: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x16, #0x60]\n"
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -1008,10 +1008,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x16, #0x68]\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -1032,10 +1032,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (1, 2): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x16, #0x70]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
@@ -1060,10 +1060,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 5): Bit 2: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla v27.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -1084,10 +1084,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x80]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
@@ -1112,10 +1112,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (5, 1): Bit 2: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 61f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
@@ -1136,10 +1136,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x16, #0x90]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
@@ -1164,10 +1164,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (5, 4): Bit 2: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 69f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -1188,10 +1188,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x16, #0xa0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
@@ -1216,10 +1216,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x16, #0xa8]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 77f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1241,10 +1241,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x16, #0xb0]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
@@ -1269,10 +1269,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1294,10 +1294,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"87:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x16, #0xc0]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 89f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1319,10 +1319,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"91:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x20, [x14, #0xc8]\n"
+ "ldr x20, [x16, #0xc8]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
@@ -1347,10 +1347,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"95:" // Oddments: Load input (2, 5): Bit 2: End
- "ldr x20, [x14, #0xd0]\n"
+ "ldr x20, [x16, #0xd0]\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 97f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1372,10 +1372,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"99:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x20, [x14, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 101f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1397,10 +1397,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"103:" // Oddments: Load input (4, 2): Bit 2: End
- "ldr x20, [x14, #0xe0]\n"
+ "ldr x20, [x16, #0xe0]\n"
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
@@ -1425,10 +1425,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"107:" // Oddments: Load input (3, 5): Bit 2: End
- "ldr x20, [x14, #0xe8]\n"
+ "ldr x20, [x16, #0xe8]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 109f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1450,10 +1450,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 2): Bit 2: End
- "ldr x20, [x14, #0xf0]\n"
+ "ldr x20, [x16, #0xf0]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"tbz %x[n_channels], #2, 113f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1475,10 +1475,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"115:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
@@ -1503,10 +1503,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"119:" // Oddments: Load input (5, 3): Bit 2: End
- "ldr x20, [x14, #0x100]\n"
+ "ldr x20, [x16, #0x100]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.8h, v6.8h, v12.8h\n"
"tbz %x[n_channels], #2, 121f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1528,10 +1528,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"123:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x20, [x14, #0x108]\n"
+ "ldr x20, [x16, #0x108]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 125f\n"
@@ -1554,10 +1554,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"127:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x20, [x14, #0x110]\n"
+ "ldr x20, [x16, #0x110]\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 129f\n"
@@ -1580,10 +1580,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"131:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x20, [x14, #0x118]\n"
+ "ldr x20, [x16, #0x118]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 133f\n"
@@ -1608,24 +1608,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"135:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
"fmin v18.8h, v18.8h, v14.8h\n"
@@ -1643,150 +1643,150 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 137f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.d }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.d }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.d }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.d }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.d }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.d }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.d }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x8\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v28.d }[0], [x23]\n"
"st1 { v29.d }[0], [x22]\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 136f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x4\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v28.s }[2], [x23]\n"
"st1 { v29.s }[2], [x22]\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[6], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[6], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[6], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[6], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[6], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[6], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[6], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[6], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[6], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[6], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[6], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[6], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[6], [x23]\n"
"st1 { v29.h }[6], [x22]\n"
"st1 { v30.h }[6], [x21]\n"
@@ -1794,50 +1794,50 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"b 139f\n"
"136:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[4], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[4], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[4], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[4], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[4], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[4], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[4], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[4], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[4], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[4], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[4], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[4], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[4], [x23]\n"
"st1 { v29.h }[4], [x22]\n"
"st1 { v30.h }[4], [x21]\n"
@@ -1845,161 +1845,159 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"b 139f\n"
"137:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 138f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x4\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v28.s }[0], [x23]\n"
"st1 { v29.s }[0], [x22]\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[2], [x23]\n"
"st1 { v29.h }[2], [x22]\n"
"st1 { v30.h }[2], [x21]\n"
"st1 { v31.h }[2], [x20]\n"
"b 139f\n"
"138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[0], [x23]\n"
"st1 { v29.h }[0], [x22]\n"
"st1 { v30.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"139:" // Oddments: Store: Bit 2: End
-
"140:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index d52f48064f..8ad6a37fea 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 268dda531d..8954999990 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -118,9 +118,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x11, x13, x6\n"
"add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x10, x12, x24, LSL #1\n"
"add x9, x11, x6\n"
"add x28, x17, x21, LSL #1\n"
@@ -128,7 +128,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
@@ -150,179 +150,179 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
"add x23, x23, #0x10\n"
"add x8, x8, #0x10\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
"ld1 { v10.8h }, [x8]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "ld1 { v14.8h }, [x12]\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v20.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
"add x16, x16, #0x10\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "ld1 { v15.8h }, [x14]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q17, [x15, #0x0]\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x15, #0x0]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x12, x11]\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
"add x20, x20, #0x10\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
+ "fmla v23.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.8h, v0.8h, v25.8h\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
"add x21, x21, #0x10\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x14, x9]\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x14, x9]\n"
"ldr q4, [x15, #0x50]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ld1 { v15.8h }, [x10]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v22.8h, v2.8h, v20.8h\n"
"ldr q2, [x15, #0x30]\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v7.8h, v18.8h\n"
"ldr q16, [x12, x13]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
+ "fmla v23.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.8h, v3.8h, v16.8h\n"
"ldr q3, [x15, #0x40]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
"ldr q13, [x8, x9]\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
+ "fmla v22.8h, v7.8h, v19.8h\n"
"ld1 { v14.8h }, [x16]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v24.8h\n"
"ldr q12, [x8, x11]\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
"ldr q16, [x8, x13]\n"
"ldr q5, [x15, #0x60]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x10, x9]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "fmla v28.8h, v8.8h, v20.8h\n"
+ "ldr q17, [x10, x9]\n"
"ldr q6, [x15, #0x70]\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
"ldr q11, [x8, x6]\n"
"ldr q15, [x16, x6]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
"add x14, x14, #0x10\n"
"ldr q9, [x14, x13]\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
"add x12, x12, #0x10\n"
"add x10, x10, #0x10\n"
- "st1 { v28.8h }, [x17]\n"
+ "st1 { v29.8h }, [x17]\n"
"add x15, x15, #0xa0\n"
- "str q29, [x17, x7]\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v30.8h }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "st1 { v23.8h }, [x28]\n"
+ "str q22, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
"add x8, x8, #0x10\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "ld1 { v14.8h }, [x12]\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v19.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
"add x16, x16, #0x10\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "ld1 { v15.8h }, [x14]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x12, x11]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x14, x9]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ld1 { v15.8h }, [x10]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.8h, v0.8h, v25.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v22.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
"add x14, x14, #0x10\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v24.8h\n"
"ldr q16, [x12, x13]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
- "st1 { v28.8h }, [x17]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v22.8h, v6.8h, v17.8h\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.8h, v3.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v22.8h, v7.8h, v20.8h\n"
+ "fmla v21.8h, v7.8h, v18.8h\n"
+ "st1 { v29.8h }, [x17]\n"
"add x12, x12, #0x10\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x10, x9]\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
"add x10, x10, #0x10\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "str q29, [x17, x7]\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
- "st1 { v30.8h }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "st1 { v22.8h }, [x28]\n"
+ "str q21, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 81f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"add x27, x14, x13\n"
"add x26, x8, XZR\n"
@@ -409,17 +409,17 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h15, [x21, #0x0]\n"
"ldr h16, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"add x20, x16, x11\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 10f\n"
@@ -802,14 +802,14 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h11, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 78f\n"
"mov x21, x17\n"
"mov x20, x28\n"
@@ -871,7 +871,6 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"80:" // Tile loop: Oddments: Store: Bit 2: End
-
"81:" // Tile loop: End
"ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -886,7 +885,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 144d11fb39..6ae0b30afd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -88,258 +88,258 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "lsr x25, %x[n_channels], #0x3\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x3\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
- "sub x23, XZR, x26\n"
- "cbz x25, 3f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "cmp x26, x25, LSL #4\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
- "add x24, x24, #0xa0\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x28]\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
"ldr q10, [x20, x28]\n"
"ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
"ldr q12, [x20, x28]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x28]\n"
- "ldr q14, [x21, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
"ldp x21, x20, [x13, #0x30]\n"
"ldr q15, [x21, x28]\n"
"ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.8h, v2.8h, v13.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.8h, v3.8h, v14.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q17, [x24, #0x0]\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x20, x28]\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "ldr q0, [x24, #0x10]\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v4.8h, v15.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
"ldr q16, [x21, x28]\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v19.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.8h, v4.8h, v18.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "ldr q4, [x23, #0x50]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
- "ldr q11, [x20, x28]\n"
- "ldr q1, [x24, #0x20]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr x21, [x13, #0x90]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q15, [x21, x28]\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "ldr q13, [x22, x28]\n"
- "ldr q3, [x24, #0x40]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr x21, [x13, #0xb0]\n"
- "add x23, x23, #0x10\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "ldr q14, [x21, x28]\n"
+ "fmla v24.8h, v6.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v17.8h\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.8h, v3.8h, v17.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v20.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0xb8]\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "ldr q15, [x20, x28]\n"
- "ldr q7, [x24, #0x80]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q8, [x24, #0x90]\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x26]\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.8h, v19.8h, v26.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.8h, v19.8h, v27.8h\n"
"add x28, x28, #0x10\n"
- "ldr q10, [x20, x26]\n"
+ "ldr q10, [x20, x25]\n"
"ldp x21, x20, [x13, #0x10]\n"
- "str q28, [x12, x23]\n"
- "add x24, x24, #0xa0\n"
- "ldr q11, [x21, x26]\n"
- "ldr q12, [x20, x26]\n"
- "str q29, [x11, x23]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x26]\n"
- "str q30, [x10, x23]\n"
- "ldr q14, [x21, x26]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
"ldp x21, x20, [x13, #0x30]\n"
- "str q31, [x9, x23]\n"
- "ldr q15, [x21, x26]\n"
- "ldr q16, [x20, x26]\n"
- "add x26, x26, #0x10\n"
- "cmp x26, x25, LSL #4\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.8h, v2.8h, v13.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.8h, v3.8h, v14.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x20, x28]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.8h, v4.8h, v15.8h\n"
+ "fmla v24.8h, v4.8h, v18.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
"ldr q16, [x21, x28]\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v20.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v0.8h, v23.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ldr x21, [x13, #0x90]\n"
- "ldr q15, [x21, x28]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "ldr q13, [x22, x28]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr q14, [x21, x28]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v1.8h, v17.8h\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v2.8h, v19.8h\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v18.8h\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
"ldr x20, [x13, #0xb8]\n"
- "ldr q15, [x20, x28]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "add x23, x23, #0x10\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.8h, v6.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
"add x28, x28, #0x10\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "str q28, [x12, x23]\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "str q29, [x11, x23]\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
- "str q30, [x10, x23]\n"
- "str q31, [x9, x23]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 80f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "mov x23, x28\n"
- "add x12, x12, x23\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "add x9, x9, x23\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
"ldr x27, [x13, #0x0]\n"
"ldr x26, [x13, #0x8]\n"
"add x27, x27, x28\n"
@@ -425,18 +425,18 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v15.h }[0], [x21], #0x2\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"ldr x20, [x13, #0x40]\n"
"add x20, x20, x28\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 9f\n"
@@ -835,14 +835,14 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v11.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 77f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -887,7 +887,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"80:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 81a608e349..1d1d491c28 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 8807f5d306..cecaf79704 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"add x13, x15, x2\n"
"add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x12, x14, x24, LSL #1\n"
"add x11, x13, x2\n"
"add x10, x5, x21, LSL #1\n"
@@ -130,7 +130,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x8, #0x20]\n"
@@ -150,366 +150,366 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1 { v14.8h }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
"add x23, x23, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q16, [x8, #0x140]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
"add x7, x7, #0x10\n"
- "fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
"ldr q1, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
"add x4, x4, #0x10\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x8, #0x20]\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q17, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
"add x21, x21, #0x10\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
+ "fmla v29.8h, v3.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.8h, v19.8h, v7.8h\n"
"ld1 { v7.8h }, [x7]\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x17, x11]\n"
- "fmla v29.8h, v1.8h, v13.8h\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v31.8h, v19.8h, v8.8h\n"
+ "fmla v29.8h, v19.8h, v14.8h\n"
+ "fmla v28.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.8h, v17.8h, v13.8h\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.8h, v17.8h, v23.8h\n"
"add x17, x17, #0x10\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ld1 { v5.8h }, [x16]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x16, x2]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v0.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.8h, v16.8h, v23.8h\n"
+ "ld1 { v24.8h }, [x16]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v0.8h\n"
+ "fmla v28.8h, v16.8h, v1.8h\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.8h, v20.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.8h, v20.8h, v1.8h\n"
+ "fmla v28.8h, v20.8h, v26.8h\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.8h, v19.8h, v6.8h\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v23.8h\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v23.8h\n"
+ "fmla v28.8h, v18.8h, v22.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ld1 { v9.8h }, [x14]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v17.8h, v22.8h\n"
+ "fmla v28.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.8h, v16.8h, v0.8h\n"
+ "ld1 { v0.8h }, [x14]\n"
+ "fmla v31.8h, v16.8h, v1.8h\n"
+ "fmla v29.8h, v16.8h, v20.8h\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v1.8h\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.8h, v11.8h, v24.8h\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.8h, v11.8h, v23.8h\n"
+ "fmla v29.8h, v11.8h, v0.8h\n"
+ "fmla v28.8h, v11.8h, v4.8h\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.8h, v18.8h, v22.8h\n"
+ "fmla v29.8h, v18.8h, v4.8h\n"
+ "fmla v28.8h, v18.8h, v6.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.8h, v17.8h, v20.8h\n"
"add x14, x14, #0x10\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v17.8h, v6.8h\n"
+ "fmla v28.8h, v17.8h, v26.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.8h, v16.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v31.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v26.8h\n"
+ "fmla v28.8h, v16.8h, v12.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.8h, v13.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.8h, v13.8h, v5.8h\n"
"ld1 { v14.8h }, [x17]\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x130]\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v13.8h, v12.8h\n"
+ "fmla v28.8h, v13.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.8h, v24.8h, v0.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.8h, v24.8h, v4.8h\n"
+ "fmla v29.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.8h, v24.8h, v17.8h\n"
"ldr q0, [x8, #0x150]\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v23.8h, v4.8h\n"
"ldr q13, [x7, x6]\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v23.8h, v6.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
"ldr q1, [x8, #0x160]\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v21.8h, v6.8h\n"
"ld1 { v5.8h }, [x4]\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
"ldr q2, [x8, #0x170]\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v20.8h, v26.8h\n"
"ldr q6, [x4, x2]\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v31.8h, v20.8h, v12.8h\n"
"add x12, x12, #0x10\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
"ldr q11, [x4, x15]\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
"ldr q3, [x8, #0x180]\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v12.8h\n"
"ldr q8, [x7, x2]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
"ldr q10, [x7, x11]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
"ldr q12, [x4, x13]\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
"ldr q9, [x4, x6]\n"
"ldr q4, [x8, #0x190]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"add x8, x8, #0x1a0\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x5]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x5, x3]\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
- "st1 { v30.8h }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "st1 { v29.8h }, [x10]\n"
+ "str q28, [x10, x3]\n"
"add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x8, #0x0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
"add x7, x7, #0x10\n"
"fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x8, #0x10]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "add x4, x4, #0x10\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x8, #0x20]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x17, x11]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "add x17, x17, #0x10\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q18, [x8, #0x10]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ld1 { v5.8h }, [x16]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x16, x2]\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ld1 { v25.8h }, [x16]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ld1 { v9.8h }, [x14]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ld1 { v7.8h }, [x14]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
"add x14, x14, #0x10\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x130]\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.8h, v2.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v5.8h, v2.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.8h, v14.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.8h, v14.8h, v16.8h\n"
+ "fmla v30.8h, v14.8h, v13.8h\n"
+ "fmla v29.8h, v14.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
"add x8, x8, #0x140\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"add x12, x12, #0x10\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x5]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x5, x3]\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "st1 { v31.8h }, [x5]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v30.8h }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "str q29, [x10, x3]\n"
"add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 117f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
@@ -609,11 +609,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h10, [x21, #0x0]\n"
"ldr h14, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
"add x20, x7, x15\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
"fmla v30.8h, v1.8h, v8.8h\n"
@@ -1294,14 +1294,14 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h9, [x20, #0x0]\n"
"112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 114f\n"
"mov x21, x5\n"
"mov x20, x10\n"
@@ -1363,7 +1363,6 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"116:" // Tile loop: Oddments: Store: Bit 2: End
-
"117:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -1378,7 +1377,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index a2791d277e..4913340c4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -103,16 +103,16 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x9, %x[n_channels], #0x3\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x14, x13, [x21, #0x0]\n"
"ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
"sub x28, XZR, x17\n"
"cbz x9, 3f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -120,436 +120,436 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldr q5, [x27, x10]\n"
- "ldr q6, [x26, x10]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x10]\n"
- "ldr q8, [x24, x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q9, [x23, x10]\n"
- "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
"ldp x21, x20, [x15, #0x30]\n"
"ldr q11, [x21, x10]\n"
"ldr q12, [x20, x10]\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x10]\n"
- "ldr q14, [x26, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr q16, [x16, #0x140]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr q6, [x24, x10]\n"
- "fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x23, [x15, #0xa0]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x16, #0x80]\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q17, [x20, x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr x27, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q5, [x20, x10]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x16, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x23, x10]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.8h, v4.8h, v17.8h\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.8h, v23.8h, v7.8h\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.8h, v23.8h, v14.8h\n"
+ "fmla v29.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.8h, v21.8h, v8.8h\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.8h, v21.8h, v13.8h\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v13.8h\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.8h, v16.8h, v24.8h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.8h, v20.8h, v2.8h\n"
+ "fmla v29.8h, v20.8h, v8.8h\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.8h, v18.8h, v10.8h\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.8h, v18.8h, v8.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.8h, v1.8h, v14.8h\n"
+ "ldr q0, [x20, x10]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x130]\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v29.8h, v1.8h, v22.8h\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.8h, v17.8h, v5.8h\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.8h, v17.8h, v22.8h\n"
+ "fmla v29.8h, v17.8h, v21.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v16.8h, v19.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.8h, v23.8h, v1.8h\n"
+ "fmla v29.8h, v23.8h, v19.8h\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v8.8h\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.8h, v20.8h, v25.8h\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.8h, v20.8h, v19.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.8h, v6.8h, v22.8h\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v2.8h\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.8h, v18.8h, v21.8h\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.8h, v17.8h, v21.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v1.8h\n"
+ "fmla v28.8h, v17.8h, v5.8h\n"
+ "fmla v29.8h, v17.8h, v25.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.8h, v13.8h, v1.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.8h, v13.8h, v19.8h\n"
+ "fmla v28.8h, v13.8h, v25.8h\n"
+ "fmla v29.8h, v13.8h, v10.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.8h, v9.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.8h, v9.8h, v0.8h\n"
+ "fmla v28.8h, v9.8h, v10.8h\n"
+ "fmla v29.8h, v9.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.8h, v24.8h, v16.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v24.8h, v2.8h\n"
+ "fmla v28.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.8h, v24.8h, v17.8h\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "fmla v31.8h, v23.8h, v5.8h\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "ldr q5, [x27, x17]\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v21.8h, v5.8h\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "fmla v28.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "ldr q6, [x26, x17]\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x17]\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v20.8h, v25.8h\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "ldr q8, [x24, x17]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q13, [x22, x17]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "ldr q9, [x23, x17]\n"
+ "fmla v30.8h, v19.8h, v10.8h\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x21, x17]\n"
"ldr q4, [x16, #0x190]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"ldr q11, [x21, x17]\n"
"ldr q12, [x20, x17]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x17]\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "ldr q14, [x26, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "ldr q14, [x20, x17]\n"
"add x17, x17, #0x10\n"
"cmp x17, x9, LSL #4\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
"add x10, x10, #0x10\n"
- "str q28, [x14, x28]\n"
+ "str q30, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q29, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr x23, [x15, #0x60]\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "ldr x21, [x15, #0x60]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x25, [x15, #0x90]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x24, [x15, #0x98]\n"
- "ldr x23, [x15, #0xa0]\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x16, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr x27, [x15, #0x100]\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
"ldr q3, [x16, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x130]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.8h, v3.8h, v20.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.8h, v3.8h, v19.8h\n"
+ "fmla v30.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.8h, v11.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.8h, v11.8h, v16.8h\n"
+ "fmla v30.8h, v11.8h, v13.8h\n"
+ "fmla v29.8h, v11.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"add x10, x10, #0x10\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x14, x28]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x13, x28]\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x13, x28]\n"
"str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q29, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 116f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x28, x10\n"
- "add x14, x14, x28\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x28\n"
+ "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
"add x9, x9, x10\n"
@@ -654,12 +654,12 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v10.h }[0], [x21], #0x2\n"
"ld1 { v14.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
"ldr x20, [x15, #0x50]\n"
"add x20, x20, x10\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
"fmla v30.8h, v1.8h, v8.8h\n"
@@ -1365,14 +1365,14 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 113f\n"
"st1 { v28.d }[0], [x14], #0x8\n"
"st1 { v29.d }[0], [x13], #0x8\n"
@@ -1417,7 +1417,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"116:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
index 1ccd3408e2..b7608af721 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -42,7 +42,7 @@ class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKer
public:
a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}
- virtual KernelType get_kernel() const override { return kernel; }
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 418530fdc4..08f40b785f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -45,70 +45,70 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v2.8h }, [%x[minmax_vals]]\n"
- "lsr x12, %x[n_channels], #0x3\n"
+ "lsr x9, %x[n_channels], #0x3\n"
"add x20, %x[minmax_vals], #0x2\n"
"ld1r { v1.8h }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 5f\n"
+ "cbz x9, 5f\n"
"1:" // Channel loop
"movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr q14, [x10, x11]\n"
- "ldr q15, [x9, x11]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr q16, [x28, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x26, x11]\n"
- "ldr q19, [x25, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr q20, [x24, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
"fmla v23.8h, v14.8h, v0.8h\n"
- "ldr q14, [x10, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v24.8h, v15.8h, v0.8h\n"
"fmla v25.8h, v16.8h, v0.8h\n"
- "ldr q15, [x9, x11]\n"
- "ldr q16, [x28, x11]\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
"fmla v26.8h, v17.8h, v0.8h\n"
"fmla v27.8h, v18.8h, v0.8h\n"
- "ldr q17, [x27, x11]\n"
- "ldr q18, [x26, x11]\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
"fmla v28.8h, v19.8h, v0.8h\n"
"fmla v29.8h, v20.8h, v0.8h\n"
- "ldr q19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v30.8h, v21.8h, v0.8h\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x24, x11]\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.8h, v14.8h, v0.8h\n"
@@ -153,7 +153,7 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"str q30, [x21, x11]\n"
"str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x12, LSL #4\n"
+ "cmp x11, x9, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x7\n"
@@ -183,209 +183,209 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"9:" // Oddments: Load bias: Bit 2: End
"10:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 12f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #1, 11f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[6], [x10], #0x2\n"
- "ld1 { v15.h }[6], [x9], #0x2\n"
- "ld1 { v16.h }[6], [x28], #0x2\n"
- "ld1 { v17.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v19.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v21.h }[6], [x23], #0x2\n"
- "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"b 14f\n"
"11:" // Oddments: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[4], [x10], #0x2\n"
- "ld1 { v15.h }[4], [x9], #0x2\n"
- "ld1 { v16.h }[4], [x28], #0x2\n"
- "ld1 { v17.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v19.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v21.h }[4], [x23], #0x2\n"
- "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"b 14f\n"
"12:" // Oddments: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 13f\n"
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[2], [x10], #0x2\n"
- "ld1 { v15.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v17.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v19.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v21.h }[2], [x23], #0x2\n"
- "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"b 14f\n"
"13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"14:" // Oddments: Load: Bit 2: End
"subs x20, %x[n_points], #0x1\n"
"ble 20f\n"
"15:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v23.8h, v14.8h, v0.8h\n"
"fmla v24.8h, v15.8h, v0.8h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"fmla v25.8h, v16.8h, v0.8h\n"
"fmla v26.8h, v17.8h, v0.8h\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr x21, [x10], #0x8\n"
"fmla v27.8h, v18.8h, v0.8h\n"
"fmla v28.8h, v19.8h, v0.8h\n"
- "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
"fmla v29.8h, v20.8h, v0.8h\n"
"fmla v30.8h, v21.8h, v0.8h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[6], [x10], #0x2\n"
- "ld1 { v15.h }[6], [x9], #0x2\n"
- "ld1 { v16.h }[6], [x28], #0x2\n"
- "ld1 { v17.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v19.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v21.h }[6], [x23], #0x2\n"
- "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"b 19f\n"
"16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[4], [x10], #0x2\n"
- "ld1 { v15.h }[4], [x9], #0x2\n"
- "ld1 { v16.h }[4], [x28], #0x2\n"
- "ld1 { v17.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v19.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v21.h }[4], [x23], #0x2\n"
- "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"b 19f\n"
"17:" // Oddments: Planar loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[2], [x10], #0x2\n"
- "ld1 { v15.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v17.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v19.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v21.h }[2], [x23], #0x2\n"
- "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"b 19f\n"
"18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"19:" // Oddments: Planar loop: Load: Bit 2: End
"subs x20, x20, #0x1\n"
"bgt 15b\n"
@@ -507,12 +507,10 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.h }[0], [x21], #0x2\n"
"st1 { v31.h }[0], [x20], #0x2\n"
"24:" // Oddments: Store: Bit 2: End
-
"25:" // End
-
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 8fcbce2cfe..3646c18b04 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index f246cec87e..cee3fb59c5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -44,10 +44,10 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
const __fp16 minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v7.8h }, [%x[minmax_vals]]\n"
+ "ld1r { v8.8h }, [%x[minmax_vals]]\n"
"lsr x11, %x[n_output_channels], #0x3\n"
"add x20, %x[minmax_vals], #0x2\n"
- "ld1r { v6.8h }, [x20]\n"
+ "ld1r { v7.8h }, [x20]\n"
"mov x10, #0x0\n"
"cbz x11, 8f\n"
"1:" // Output channel loop
@@ -56,12 +56,12 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"lsl x20, x10, #0x1\n"
"ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "ldr q5, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
@@ -79,26 +79,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 6f\n"
- "ldr q2, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 6f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q0, [x9, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x24, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
@@ -107,332 +127,312 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "ldr q0, [x9, #0x0]\n"
- "ldr q2, [%x[weights], #0x10]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "lsl x28, x10, #0x1\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
"fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
"fmla v28.8h, v5.8h, v3.h[4]\n"
"fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
"lsl x28, x10, #0x1\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "add %x[weights], %x[weights], #0x10\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
+ "add %x[weights], %x[weights], #0x10\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
"fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
"fmla v28.8h, v5.8h, v3.h[4]\n"
"fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v16.8h, v1.8h, v2.h[0]\n"
+ "fmla v17.8h, v1.8h, v2.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v1.8h, v2.h[2]\n"
+ "fmla v19.8h, v1.8h, v2.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v1.8h, v2.h[4]\n"
+ "fmla v21.8h, v1.8h, v2.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v1.8h, v2.h[6]\n"
+ "fmla v23.8h, v1.8h, v2.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v1.8h, v0.h[0]\n"
+ "fmla v25.8h, v1.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v1.8h, v0.h[2]\n"
+ "fmla v27.8h, v1.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v1.8h, v0.h[4]\n"
+ "fmla v29.8h, v1.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v1.8h, v0.h[6]\n"
+ "fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"lsl x28, x10, #0x1\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x8\n"
"cmp x10, x11, LSL #3\n"
@@ -464,12 +464,12 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ld1 { v31.h }[0], [x20]\n"
"12:" // Output channel oddments: Load bias: Bit 2: End
"13:" // Output channel oddments: Load bias: Done
- "ldr q5, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
@@ -487,26 +487,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 17f\n"
- "ldr q2, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 17f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q0, [x9, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
"beq 15f\n"
"14:" // Output channel oddments: Kernel loop
- "ldp x24, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
@@ -515,32 +535,28 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "ldr q0, [x9, #0x0]\n"
- "ldr q2, [%x[weights], #0x10]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 14b\n"
"15:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
@@ -557,63 +573,31 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
"b 18f\n"
"16:" // Output channel oddments: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
@@ -628,415 +612,429 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v0.8h, v2.h[0]\n"
+ "fmla v17.8h, v0.8h, v2.h[1]\n"
+ "fmla v18.8h, v0.8h, v2.h[2]\n"
+ "fmla v19.8h, v0.8h, v2.h[3]\n"
+ "fmla v20.8h, v0.8h, v2.h[4]\n"
+ "fmla v21.8h, v0.8h, v2.h[5]\n"
+ "fmla v22.8h, v0.8h, v2.h[6]\n"
+ "fmla v23.8h, v0.8h, v2.h[7]\n"
+ "fmla v24.8h, v0.8h, v1.h[0]\n"
+ "fmla v25.8h, v0.8h, v1.h[1]\n"
+ "fmla v26.8h, v0.8h, v1.h[2]\n"
+ "fmla v27.8h, v0.8h, v1.h[3]\n"
+ "fmla v28.8h, v0.8h, v1.h[4]\n"
+ "fmla v29.8h, v0.8h, v1.h[5]\n"
+ "fmla v30.8h, v0.8h, v1.h[6]\n"
+ "fmla v31.8h, v0.8h, v1.h[7]\n"
"b 18f\n"
"17:" // Output channel oddments: Single kernel point
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"18:" // Output channel oddments: Done
- "fmin v16.8h, v16.8h, v6.8h\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
"tbz %x[n_output_channels], #2, 20f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.d }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.d }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.d }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.d }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.d }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.d }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.d }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.d }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x4\n"
- "st1 { v24.d }[0], [x20]\n"
- "st1 { v25.d }[0], [x21]\n"
- "st1 { v26.d }[0], [x22]\n"
- "st1 { v27.d }[0], [x23]\n"
- "st1 { v28.d }[0], [x24]\n"
- "st1 { v29.d }[0], [x25]\n"
- "st1 { v30.d }[0], [x26]\n"
- "st1 { v31.d }[0], [x27]\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_output_channels], #1, 19f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.s }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.s }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.s }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.s }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.s }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.s }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.s }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.s }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x26]\n"
- "st1 { v31.s }[2], [x27]\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[6], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[6], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[6], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[6], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[6], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[6], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[6], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[6], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[6], [x20]\n"
- "st1 { v25.h }[6], [x21]\n"
- "st1 { v26.h }[6], [x22]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x24]\n"
- "st1 { v29.h }[6], [x25]\n"
- "st1 { v30.h }[6], [x26]\n"
- "st1 { v31.h }[6], [x27]\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x27]\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 22f\n"
"19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[4], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[4], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[4], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[4], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[4], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[4], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[4], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[4], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[4], [x20]\n"
- "st1 { v25.h }[4], [x21]\n"
- "st1 { v26.h }[4], [x22]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x24]\n"
- "st1 { v29.h }[4], [x25]\n"
- "st1 { v30.h }[4], [x26]\n"
- "st1 { v31.h }[4], [x27]\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x27]\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 22f\n"
"20:" // Output channel oddments: Done: Store: Bit 2: Unset
"tbz %x[n_output_channels], #1, 21f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.s }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.s }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.s }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.s }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.s }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.s }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.s }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.s }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v24.s }[0], [x20]\n"
- "st1 { v25.s }[0], [x21]\n"
- "st1 { v26.s }[0], [x22]\n"
- "st1 { v27.s }[0], [x23]\n"
- "st1 { v28.s }[0], [x24]\n"
- "st1 { v29.s }[0], [x25]\n"
- "st1 { v30.s }[0], [x26]\n"
- "st1 { v31.s }[0], [x27]\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[2], [x20]\n"
- "st1 { v25.h }[2], [x21]\n"
- "st1 { v26.h }[2], [x22]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x24]\n"
- "st1 { v29.h }[2], [x25]\n"
- "st1 { v30.h }[2], [x26]\n"
- "st1 { v31.h }[2], [x27]\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x27]\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 22f\n"
"21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"22:" // Output channel oddments: Done: Store: Bit 2: End
-
"23:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 420e95384d..5d3db974f0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 2ff03aa15a..fd8686c15e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -110,15 +110,15 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"add x11, x15, x15\n"
"ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
"mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x9, x13, x25, LSL #2\n"
"mul x20, x20, x26\n" // offset *= output_tile_size
"add x28, x9, x25, LSL #2\n"
"add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x27, x28, x25, LSL #2\n"
"add x26, x11, x15\n"
"add x25, x12, x24, LSL #2\n"
@@ -126,7 +126,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x10, #0x20]\n"
@@ -145,162 +145,162 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
"add x23, x23, #0x10\n"
"cmp x23, x22, LSL #4\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x27]\n"
- "ldr q16, [x10, #0x0]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"add x13, x13, #0x10\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"ldr q4, [x10, #0x50]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ld1 { v9.4s }, [x28]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
"ldr q1, [x10, #0x20]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
"ldr q0, [x10, #0x10]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
"ldr q2, [x10, #0x30]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
"ldr q13, [x28, x15]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x15]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
"ldr q3, [x10, #0x40]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x11]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
"ldr q5, [x10, #0x60]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"ldr q11, [x13, x26]\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
"ldr q9, [x9, x15]\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
"ld1 { v10.4s }, [x13]\n"
"ldr q6, [x10, #0x70]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
"ldr q12, [x9, x11]\n"
"ldr q7, [x10, #0x80]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
"ldr q8, [x10, #0x90]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"add x27, x27, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x12]\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
"add x10, x10, #0xa0\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x12, x14]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.4s }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x27]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"add x13, x13, #0x10\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ld1 { v9.4s }, [x28]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x15]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x11]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"add x27, x27, #0x10\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x12]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x12, x14]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.4s }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 31f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"add x24, x9, x15\n"
"add x23, x13, XZR\n"
@@ -335,11 +335,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s12, [x21, #0x0]\n"
"ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
"add x20, x27, XZR\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v12.4s\n"
@@ -470,14 +470,14 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
"fmla v30.4s, v8.4s, v12.4s\n"
"fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
"tbz %x[n_channels], #1, 29f\n"
"mov x21, x12\n"
"mov x20, x25\n"
@@ -503,7 +503,6 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"30:" // Tile loop: Oddments: Store: Bit 1: End
-
"31:" // Tile loop: End
"ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -518,11 +517,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 56e9ed2e1b..7dedfd972a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -83,16 +83,16 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x15, %x[n_channels], #0x2\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
"sub x27, XZR, x16\n"
"cbz x15, 3f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x16, x15, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -104,197 +104,197 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "ldr q9, [x26, x28]\n"
- "ldr q10, [x22, x28]\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr q11, [x25, x28]\n"
- "ldr q12, [x24, x28]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x28]\n"
- "bge 2f\n"
- "1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q9, [x22, x28]\n"
- "ldr q16, [x14, #0x0]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
"ldr q12, [x20, x28]\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "ldr x24, [x13, #0x58]\n"
- "ldr x23, [x13, #0x60]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x25, x28]\n"
- "ldr x22, [x13, #0x68]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x28]\n"
- "ldr x21, [x13, #0x70]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"ldr q4, [x14, #0x50]\n"
- "ldr x20, [x13, #0x78]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q9, [x23, x28]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x21, x28]\n"
"ldr q1, [x14, #0x20]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x22, x28]\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x16]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x25, x28]\n"
"ldr q3, [x14, #0x40]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
- "ldr q11, [x25, x16]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x22, x16]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "ldr q9, [x26, x16]\n"
- "ldr q10, [x22, x16]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "ldr q12, [x24, x16]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x21, x16]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"add x16, x16, #0x10\n"
"add x27, x27, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
"cmp x16, x15, LSL #4\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
"add x28, x28, #0x10\n"
- "str q28, [x12, x27]\n"
+ "str q24, [x12, x27]\n"
"add x14, x14, #0xa0\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q9, [x22, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr x24, [x13, #0x58]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
"ldr x23, [x13, #0x60]\n"
"ldr x22, [x13, #0x68]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x25, x28]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
"ldr x21, [x13, #0x70]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x28]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0x78]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"add x27, x27, #0x10\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q9, [x23, x28]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"add x28, x28, #0x10\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x12, x27]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 30f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "mov x27, x28\n"
- "add x12, x12, x27\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x9, x9, x27\n"
+ "add x9, x9, x20\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -329,12 +329,12 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v12.s }[0], [x21], #0x4\n"
"ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
"ldr x20, [x13, #0x28]\n"
"add x20, x20, x28\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v12.4s\n"
@@ -475,14 +475,14 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"27:" // Oddments: Load input (3, 2): Bit 1: End
"fmla v30.4s, v8.4s, v12.4s\n"
"fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
"tbz %x[n_channels], #1, 28f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -503,11 +503,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"30:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 0e9a3ba3fc..c2d86615e3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 620319bc7c..9bfcd9cd3c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -110,7 +110,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"lsr x23, %x[n_channels], #0x2\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x13, x16, x25, LSL #2\n"
"mul x20, x20, x26\n" // offset *= output_tile_size
"add x12, x13, x25, LSL #2\n"
@@ -120,9 +120,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"add x9, x11, x8\n"
"add x28, x15, x22, LSL #2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x27, x10, x25, LSL #2\n"
"add x26, x9, x8\n"
"add x25, x28, x22, LSL #2\n"
@@ -130,7 +130,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x24\n"
"cbz x23, 4f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x24, x23, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -149,304 +149,304 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"add x24, x24, #0x10\n"
"cmp x24, x23, LSL #4\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q16, [x14, #0x0]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ld1 { v12.4s }, [x10]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
"add x13, x13, #0x10\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
"add x10, x10, #0x10\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
"ld1 { v10.4s }, [x16]\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "ld1 { v12.4s }, [x12]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
"add x12, x12, #0x10\n"
"ldr q9, [x12, x11]\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x27, x11]\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
"ldr q11, [x16, x26]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
"ldr q13, [x13, x11]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
"add x27, x27, #0x10\n"
"ld1 { v12.4s }, [x27]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
"add x14, x14, #0xa0\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "st1 { v23.4s }, [x15]\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "str q24, [x15, x17]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q25, [x15, x22]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
"st1 { v26.4s }, [x28]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.4s }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ld1 { v12.4s }, [x10]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
"add x13, x13, #0x10\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
"add x10, x10, #0x10\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "ld1 { v12.4s }, [x12]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
"add x12, x12, #0x10\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x27, x11]\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
"add x27, x27, #0x10\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "st1 { v23.4s }, [x15]\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "str q24, [x15, x17]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q25, [x15, x22]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
"st1 { v26.4s }, [x28]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.4s }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 49f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"add x24, x12, x11\n"
"add x23, x16, XZR\n"
@@ -481,18 +481,18 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr s12, [x21, #0x0]\n"
"ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
"add x20, x27, x26\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v6.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
@@ -741,25 +741,25 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
"fmla v29.4s, v8.4s, v13.4s\n"
"fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 47f\n"
"mov x22, x15\n"
"mov x21, x28\n"
@@ -804,7 +804,6 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"st1 { v28.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"48:" // Tile loop: Oddments: Store: Bit 1: End
-
"49:" // Tile loop: End
"ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -819,11 +818,11 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 15053a337a..972f7eb535 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -87,405 +87,405 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x2\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr q13, [x27, x13]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q16, [x15, #0x0]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0x68]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "ldr x25, [x14, #0x78]\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x23, x14]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x23, x12]\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "ldr q5, [x15, #0x60]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "ldr q8, [x15, #0x90]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x23, [x16, #0x20]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x8]\n"
- "ldr q10, [x10, x8]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x8]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "ldr q12, [x28, x8]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q24, [x22, x12]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "str q25, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "str q26, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x8, x8, #0x10\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "cmp x8, x17, LSL #4\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
- "str q28, [x22, x12]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x21, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0x68]\n"
- "ldr x25, [x14, #0x78]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x23, x12]\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr x23, [x16, #0x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "str q24, [x22, x12]\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x22, [x16, #0x28]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "str q25, [x21, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "str q26, [x20, x12]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "add x13, x13, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x22, x12]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x21, x12]\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 48f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x24, [x14, #0x0]\n"
- "ldr x23, [x14, #0x8]\n"
- "add x24, x24, x13\n"
- "add x23, x23, x13\n"
- "ldr x22, [x14, #0x10]\n"
- "ldr x21, [x14, #0x18]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -506,19 +506,19 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ld1 { v12.s }[0], [x21], #0x4\n"
"ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "ldr x20, [x14, #0x28]\n"
- "add x20, x20, x13\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v6.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
@@ -534,9 +534,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"6:" // Oddments: Load input (4, 4): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (4, 4): Bit 1: End
- "ldr x20, [x14, #0x30]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla v31.4s, v8.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -545,10 +545,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (2, 1): Bit 1: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v23.4s, v7.4s, v11.4s\n"
"fmla v24.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v11.4s\n"
@@ -561,10 +561,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (0, 1): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 1): Bit 1: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x15, #0x40]\n"
"fmla v23.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -573,10 +573,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (0, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 3): Bit 1: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v24.4s, v2.4s, v12.4s\n"
"fmla v25.4s, v1.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -585,10 +585,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 3): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 3): Bit 1: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v24.4s, v8.4s, v10.4s\n"
"fmla v25.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.4s, v5.4s, v10.4s\n"
"fmla v28.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
@@ -601,10 +601,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (1, 0): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (1, 0): Bit 1: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v23.4s, v3.4s, v11.4s\n"
"fmla v26.4s, v0.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -613,10 +613,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 4): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v25.4s, v5.4s, v13.4s\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -625,10 +625,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (3, 0): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v26.4s, v6.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -637,10 +637,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (3, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (3, 2): Bit 1: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v28.4s, v6.4s, v10.4s\n"
"fmla v29.4s, v5.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v10.4s\n"
@@ -653,10 +653,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (3, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 4): Bit 1: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -665,10 +665,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (4, 1): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 1): Bit 1: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v29.4s, v7.4s, v13.4s\n"
"fmla v30.4s, v6.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -677,10 +677,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"28:" // Oddments: Load input (1, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 1): Bit 1: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v24.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 30f\n"
@@ -691,10 +691,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v24.4s, v5.4s, v11.4s\n"
"fmla v25.4s, v4.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -705,10 +705,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"32:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (4, 3): Bit 1: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.4s, v8.4s, v13.4s\n"
"fmla v31.4s, v7.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -717,10 +717,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -731,10 +731,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"36:" // Oddments: Load input (0, 2): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (0, 2): Bit 1: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
"fmla v24.4s, v1.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v25.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -744,10 +744,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v27.4s, v8.4s, v13.4s\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v30.4s, v5.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -758,10 +758,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"40:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
"fmla v26.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 42f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -771,10 +771,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (2, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (2, 4): Bit 1: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -786,120 +786,120 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"45:" // Oddments: Load input (4, 2): Bit 1: End
"fmla v29.4s, v8.4s, v13.4s\n"
"fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 46f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.d }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.d }[0], [x22]\n"
- "st1 { v25.d }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x8\n"
- "st1 { v28.d }[0], [x22]\n"
- "st1 { v29.d }[0], [x21]\n"
- "st1 { v30.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 47f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Store: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[0], [x22]\n"
- "st1 { v25.s }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"47:" // Oddments: Store: Bit 1: End
"48:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 6c897d6eaa..8a198c1818 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 6d2b6ee998..3adf8b0d9f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -124,9 +124,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x22, LSL #2\n"
"add x23, x5, x5\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v13.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x26, x9, x24, LSL #2\n"
"add x25, x28, x4\n"
"add x24, x27, x22, LSL #2\n"
@@ -134,7 +134,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x6\n"
"cbz x13, 4f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x6, x13, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -152,499 +152,499 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"add x6, x6, #0x10\n"
"cmp x6, x13, LSL #4\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
"ldr q9, [x12, x17]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v30.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.4s, v4.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
"ldr q10, [x12, x11]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.4s, v8.4s, v9.4s\n"
+ "fmla v20.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v2.4s, v9.4s\n"
"ld1 { v9.4s }, [x15]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v18.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
"ldr q10, [x15, x17]\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
"ldr q10, [x14, x4]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v25.4s, v6.4s, v10.4s\n"
"ldr q10, [x7, x17]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x12, x4]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
"ldr q12, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v26.4s, v6.4s, v9.4s\n"
+ "fmla v20.4s, v4.4s, v9.4s\n"
+ "fmla v22.4s, v3.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x14]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v19.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
"ldr q11, [x9, x17]\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
"ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
"ld1 { v10.4s }, [x12]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v18.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v12.4s\n"
"ldr q12, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
"ldr q10, [x26, x17]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v18.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
"ldr q11, [x9, x11]\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
"ldr q12, [x26, x11]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
"ldr q10, [x15, x4]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
"add x26, x26, #0x10\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v6.4s, v11.4s\n"
"ldr q11, [x15, x28]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
"add x15, x15, #0x10\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v6.4s, v12.4s\n"
"ldr q12, [x9, x4]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v25.4s, v3.4s, v10.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
"ldr q10, [x9, x28]\n"
- "ldr q9, [x14, x17]\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
"add x9, x9, #0x10\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "ldr q13, [x16, #0x0]\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
"ldr q11, [x7, x25]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v3.4s, v12.4s\n"
"ldr q12, [x14, x11]\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr q6, [x16, #0x70]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v8.4s, v10.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
"ld1 { v10.4s }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "st1 { v16.4s }, [x8]\n"
- "ldr q7, [x16, #0x80]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q17, [x8, x5]\n"
- "ldr q8, [x16, #0x90]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q18, [x8, x23]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
"add x16, x16, #0xa0\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q19, [x8, x22]\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "st1 { v28.4s }, [x8]\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "st1 { v20.4s }, [x10]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q21, [x10, x5]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q22, [x10, x23]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q23, [x10, x22]\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.4s }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v20.4s }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.4s }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v21.4s }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x17]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x12, x11]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v21.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.4s, v7.4s, v24.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x15]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x15, x17]\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "ldr q10, [x14, x4]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x7, x17]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x12, x4]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr q12, [x7, x11]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
+ "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v30.4s, v4.4s, v24.4s\n"
+ "fmla v18.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "fmla v14.4s, v0.4s, v24.4s\n"
+ "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v17.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v1.4s, v21.4s\n"
+ "ld1 { v20.4s }, [x9]\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v11.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v18.4s, v4.4s, v9.4s\n"
+ "fmla v10.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v14.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v20.4s\n"
+ "fmla v26.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.4s, v1.4s, v21.4s\n"
+ "fmla v23.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.4s, v4.4s, v21.4s\n"
+ "fmla v19.4s, v3.4s, v21.4s\n"
+ "fmla v31.4s, v0.4s, v21.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v5.4s, v20.4s\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.4s, v2.4s, v21.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v23.4s, v5.4s, v21.4s\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v11.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.4s, v7.4s, v20.4s\n"
+ "fmla v12.4s, v6.4s, v20.4s\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.4s, v4.4s, v21.4s\n"
+ "fmla v16.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v1.4s, v21.4s\n"
+ "fmla v30.4s, v0.4s, v21.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v31.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v22.4s\n"
+ "ldr q22, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x14]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x17]\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "ldr q12, [x14, x25]\n"
+ "fmla v28.4s, v7.4s, v20.4s\n"
+ "fmla v16.4s, v6.4s, v20.4s\n"
+ "fmla v27.4s, v4.4s, v20.4s\n"
+ "fmla v30.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v1.4s, v20.4s\n"
+ "fmla v12.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.4s, v2.4s, v21.4s\n"
+ "fmla v17.4s, v1.4s, v21.4s\n"
+ "fmla v19.4s, v0.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x14]\n"
+ "fmla v14.4s, v2.4s, v20.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v0.4s, v21.4s\n"
+ "fmla v31.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v4.4s, v20.4s\n"
+ "fmla v25.4s, v1.4s, v20.4s\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v19.4s, v1.4s, v22.4s\n"
+ "ldr q20, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "ld1 { v10.4s }, [x12]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr q12, [x12, x25]\n"
+ "fmla v23.4s, v6.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x12]\n"
+ "fmla v12.4s, v4.4s, v24.4s\n"
+ "fmla v14.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v2.4s, v20.4s\n"
+ "ldr q20, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x17]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "ldr q11, [x9, x11]\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x26, x11]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x15, x4]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v21.4s\n"
+ "fmla v27.4s, v3.4s, v21.4s\n"
+ "fmla v26.4s, v0.4s, v21.4s\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v14.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v30.4s, v7.4s, v24.4s\n"
+ "fmla v18.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v5.4s, v24.4s\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.4s, v5.4s, v20.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v14.4s, v4.4s, v21.4s\n"
+ "fmla v25.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.4s, v8.4s, v22.4s\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v18.4s, v7.4s, v21.4s\n"
"add x26, x26, #0x10\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "ldr q11, [x15, x28]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v10.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
"add x15, x15, #0x10\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x4]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x28]\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmla v14.4s, v7.4s, v20.4s\n"
+ "fmla v25.4s, v6.4s, v20.4s\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.4s, v4.4s, v9.4s\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v9.4s\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v19.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
"add x9, x9, #0x10\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "st1 { v16.4s }, [x8]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmla v31.4s, v2.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmla v27.4s, v7.4s, v24.4s\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v26.4s, v4.4s, v24.4s\n"
+ "fmla v12.4s, v3.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v8.4s, v0.4s\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmla v14.4s, v5.4s, v0.4s\n"
+ "fmla v25.4s, v4.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "st1 { v23.4s }, [x8]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
"str q17, [x8, x5]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q18, [x8, x23]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q19, [x8, x22]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "st1 { v20.4s }, [x10]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q21, [x10, x5]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q22, [x10, x23]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q23, [x10, x22]\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "st1 { v28.4s }, [x10]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v12.4s, v12.4s, v15.4s\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.4s, v14.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q11, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.4s }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.4s }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v26.4s }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 73f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"add x23, x14, x17\n"
"add x22, x7, XZR\n"
@@ -675,27 +675,27 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr s11, [x21, #0x0]\n"
"ldr s12, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
"add x20, x26, XZR\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
"fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
"fmla v26.4s, v1.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -704,7 +704,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
"ldr s10, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
"add x20, x26, x25\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d11, [x20], #0x8\n"
@@ -714,7 +714,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
"add x20, x12, x17\n"
"tbz %x[n_channels], #1, 11f\n"
"ldr d9, [x20], #0x8\n"
@@ -732,8 +732,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -1105,40 +1105,40 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 71f\n"
"mov x23, x8\n"
"mov x22, x10\n"
@@ -1229,4 +1229,4 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 2353045021..76045f30d6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -98,629 +98,629 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v13.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v14.4s }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "add x12, x12, #0x10\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v16.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.4s, v7.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v3.4s, v9.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.4s, v8.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v6.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v22.4s\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v18.4s, v1.4s, v22.4s\n"
+ "fmla v24.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.4s, v4.4s, v22.4s\n"
+ "fmla v15.4s, v3.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v8.4s, v9.4s\n"
"fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v27.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
"fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v10.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v21.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
"fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x13]\n"
- "ldr q9, [x11, x8]\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "ldr q13, [x15, #0x0]\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v16.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.4s, v8.4s, v9.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
"fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v15.4s, v0.4s, v11.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v9.4s\n"
+ "fmla v15.4s, v1.4s, v9.4s\n"
+ "fmla v10.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v10.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v3.4s, v22.4s\n"
+ "fmla v15.4s, v5.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v10.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v23.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.4s, v8.4s, v22.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.4s, v7.4s, v22.4s\n"
+ "ldr q7, [x17, #0x80]\n"
"fmin v17.4s, v17.4s, v14.4s\n"
- "str q16, [x23, x12]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q17, [x22, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "str q18, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q19, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q10, [x10, x8]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q20, [x23, x12]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v26.4s, v3.4s, v11.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.4s, v4.4s, v22.4s\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
"fmin v23.4s, v23.4s, v14.4s\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
"fmin v25.4s, v25.4s, v14.4s\n"
- "ldr q11, [x9, x8]\n"
- "ldr q12, [x28, x8]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "str q24, [x23, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "str q25, [x22, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "str q26, [x21, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x8, x8, #0x10\n"
- "cmp x8, x17, LSL #4\n"
- "str q27, [x20, x12]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
"fmin v29.4s, v29.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "add x13, x13, #0x10\n"
- "str q28, [x23, x12]\n"
- "str q29, [x22, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "add x12, x12, #0x10\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x13]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.4s, v6.4s, v24.4s\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v2.4s, v24.4s\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.4s, v7.4s, v22.4s\n"
+ "fmla v9.4s, v6.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v12.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v23.4s\n"
+ "fmla v15.4s, v1.4s, v23.4s\n"
+ "fmla v9.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.4s, v7.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v15.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v29.4s, v3.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.4s, v2.4s, v23.4s\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v23.4s\n"
+ "fmla v9.4s, v5.4s, v23.4s\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v5.4s, v23.4s\n"
+ "fmla v12.4s, v2.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.4s, v8.4s, v22.4s\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v12.4s, v3.4s, v16.4s\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.4s, v4.4s, v30.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v30.4s\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v24.4s\n"
+ "fmla v26.4s, v8.4s, v16.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmla v25.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v18.4s, v1.4s, v30.4s\n"
+ "fmla v31.4s, v0.4s, v30.4s\n"
+ "ldr q16, [x20, x15]\n"
"fmin v17.4s, v17.4s, v14.4s\n"
- "str q16, [x23, x12]\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "str q17, [x22, x12]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q18, [x21, x12]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "str q19, [x20, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v15.4s, v2.4s, v24.4s\n"
+ "fmla v9.4s, v1.4s, v24.4s\n"
"fmin v20.4s, v20.4s, v14.4s\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "str q20, [x23, x12]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q24, [x23, x12]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q25, [x22, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "str q26, [x21, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "str q27, [x20, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q28, [x23, x12]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v13.4s\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.4s, v4.4s, v23.4s\n"
+ "fmla v26.4s, v3.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v4.4s, v16.4s\n"
"fmin v31.4s, v31.4s, v14.4s\n"
- "str q29, [x22, x12]\n"
- "add x13, x13, #0x10\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v9.4s, v9.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v12.4s, v12.4s, v14.4s\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 72f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x23, [x14, #0x0]\n"
- "ldr x22, [x14, #0x8]\n"
- "add x23, x23, x13\n"
- "add x22, x22, x13\n"
- "ldr x21, [x14, #0x10]\n"
- "ldr x20, [x14, #0x18]\n"
- "add x21, x21, x13\n"
- "add x20, x20, x13\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x23], #0x8\n"
"ld1 { v10.d }[0], [x22], #0x8\n"
@@ -738,28 +738,28 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ld1 { v11.s }[0], [x21], #0x4\n"
"ld1 { v12.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
"fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
"fmla v26.4s, v1.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 6f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
@@ -768,9 +768,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"6:" // Oddments: Load input (5, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (5, 0): Bit 1: End
- "ldr x20, [x14, #0x28]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -779,9 +779,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (5, 5): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (5, 5): Bit 1: End
- "ldr x20, [x14, #0x30]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -790,17 +790,17 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (3, 2): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (3, 2): Bit 1: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v20.4s, v8.4s, v9.4s\n"
"fmla v21.4s, v7.4s, v9.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v6.4s, v9.4s\n"
"fmla v24.4s, v5.4s, v9.4s\n"
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -809,10 +809,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (0, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 1): Bit 1: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla v16.4s, v1.4s, v12.4s\n"
"fmla v17.4s, v0.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -821,10 +821,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (0, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (0, 4): Bit 1: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x16, #0x48]\n"
"fmla v18.4s, v2.4s, v11.4s\n"
"fmla v19.4s, v1.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -833,10 +833,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla v21.4s, v8.4s, v10.4s\n"
"fmla v22.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v23.4s, v6.4s, v10.4s\n"
"fmla v25.4s, v5.4s, v10.4s\n"
"fmla v26.4s, v4.4s, v10.4s\n"
@@ -852,10 +852,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 0): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 0): Bit 1: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla v16.4s, v3.4s, v9.4s\n"
"fmla v20.4s, v0.4s, v9.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -864,10 +864,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (1, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (1, 5): Bit 1: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x16, #0x60]\n"
"fmla v19.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v2.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -876,10 +876,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (4, 0): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (4, 0): Bit 1: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x16, #0x68]\n"
"fmla v24.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -888,10 +888,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (1, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (1, 2): Bit 1: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x16, #0x70]\n"
"fmla v16.4s, v5.4s, v10.4s\n"
"fmla v17.4s, v4.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.4s, v3.4s, v10.4s\n"
"fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v1.4s, v10.4s\n"
@@ -904,10 +904,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (4, 5): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 5): Bit 1: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla v27.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -916,10 +916,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"28:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x80]\n"
"fmla v17.4s, v5.4s, v12.4s\n"
"fmla v18.4s, v4.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.4s, v3.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
"fmla v22.4s, v1.4s, v12.4s\n"
@@ -932,10 +932,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (5, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (5, 1): Bit 1: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla v28.4s, v7.4s, v11.4s\n"
"fmla v29.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 32f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
@@ -944,10 +944,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"32:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (2, 1): Bit 1: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x16, #0x90]\n"
"fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.4s, v4.4s, v10.4s\n"
"fmla v21.4s, v3.4s, v10.4s\n"
"fmla v24.4s, v1.4s, v10.4s\n"
@@ -960,10 +960,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (5, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (5, 4): Bit 1: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
"fmla v30.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 36f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
@@ -972,10 +972,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"36:" // Oddments: Load input (2, 4): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (2, 4): Bit 1: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x16, #0xa0]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v26.4s, v2.4s, v12.4s\n"
@@ -988,10 +988,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (0, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (0, 2): Bit 1: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x16, #0xa8]\n"
"fmla v16.4s, v2.4s, v10.4s\n"
"fmla v17.4s, v1.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 40f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1001,10 +1001,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"40:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x16, #0xb0]\n"
"fmla v20.4s, v7.4s, v11.4s\n"
"fmla v21.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.4s, v4.4s, v11.4s\n"
"fmla v25.4s, v3.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
@@ -1017,10 +1017,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (0, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (0, 3): Bit 1: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla v17.4s, v2.4s, v12.4s\n"
"fmla v18.4s, v1.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1030,10 +1030,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"44:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"45:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x16, #0xc0]\n"
"fmla v16.4s, v6.4s, v10.4s\n"
"fmla v20.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 46f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1043,10 +1043,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"47:" // Oddments: Load input (3, 4): Bit 1: End
- "ldr x20, [x14, #0xc8]\n"
+ "ldr x20, [x16, #0xc8]\n"
"fmla v22.4s, v8.4s, v11.4s\n"
"fmla v23.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.4s, v5.4s, v11.4s\n"
"fmla v27.4s, v4.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v11.4s\n"
@@ -1059,10 +1059,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"48:" // Oddments: Load input (2, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"49:" // Oddments: Load input (2, 5): Bit 1: End
- "ldr x20, [x14, #0xd0]\n"
+ "ldr x20, [x16, #0xd0]\n"
"fmla v19.4s, v8.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 50f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1072,10 +1072,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (3, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"51:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x20, [x14, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
"fmla v20.4s, v6.4s, v10.4s\n"
"fmla v24.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 52f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1085,10 +1085,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"52:" // Oddments: Load input (4, 2): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"53:" // Oddments: Load input (4, 2): Bit 1: End
- "ldr x20, [x14, #0xe0]\n"
+ "ldr x20, [x16, #0xe0]\n"
"fmla v24.4s, v8.4s, v11.4s\n"
"fmla v25.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
"fmla v29.4s, v4.4s, v11.4s\n"
@@ -1101,10 +1101,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (3, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"55:" // Oddments: Load input (3, 5): Bit 1: End
- "ldr x20, [x14, #0xe8]\n"
+ "ldr x20, [x16, #0xe8]\n"
"fmla v23.4s, v8.4s, v12.4s\n"
"fmla v27.4s, v5.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 56f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1114,10 +1114,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"56:" // Oddments: Load input (5, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 2): Bit 1: End
- "ldr x20, [x14, #0xf0]\n"
+ "ldr x20, [x16, #0xf0]\n"
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v30.4s, v6.4s, v10.4s\n"
"tbz %x[n_channels], #1, 58f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1127,10 +1127,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"59:" // Oddments: Load input (4, 3): Bit 1: End
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v26.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
"fmla v30.4s, v4.4s, v11.4s\n"
@@ -1143,10 +1143,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"60:" // Oddments: Load input (5, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"61:" // Oddments: Load input (5, 3): Bit 1: End
- "ldr x20, [x14, #0x100]\n"
+ "ldr x20, [x16, #0x100]\n"
"fmla v29.4s, v8.4s, v12.4s\n"
"fmla v30.4s, v7.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.4s, v6.4s, v12.4s\n"
"tbz %x[n_channels], #1, 62f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1156,10 +1156,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (1, 1): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"63:" // Oddments: Load input (1, 1): Bit 1: End
- "ldr x20, [x14, #0x108]\n"
+ "ldr x20, [x16, #0x108]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
"fmla v17.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.4s, v1.4s, v10.4s\n"
"fmla v21.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -1170,10 +1170,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"64:" // Oddments: Load input (1, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"65:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x20, [x14, #0x110]\n"
+ "ldr x20, [x16, #0x110]\n"
"fmla v18.4s, v5.4s, v11.4s\n"
"fmla v19.4s, v4.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v2.4s, v11.4s\n"
"fmla v23.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 66f\n"
@@ -1184,10 +1184,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (4, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"67:" // Oddments: Load input (4, 1): Bit 1: End
- "ldr x20, [x14, #0x118]\n"
+ "ldr x20, [x16, #0x118]\n"
"fmla v24.4s, v7.4s, v12.4s\n"
"fmla v25.4s, v6.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -1200,24 +1200,24 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"69:" // Oddments: Load input (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
"fmin v17.4s, v17.4s, v14.4s\n"
"fmin v18.4s, v18.4s, v14.4s\n"
@@ -1235,150 +1235,150 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmin v30.4s, v30.4s, v14.4s\n"
"fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 70f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.d }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.d }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.d }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.d }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.d }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.d }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.d }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x8\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v28.d }[0], [x23]\n"
"st1 { v29.d }[0], [x22]\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 71f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.s }[2], [x23]\n"
"st1 { v29.s }[2], [x22]\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Store: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.s }[0], [x23]\n"
"st1 { v29.s }[0], [x22]\n"
"st1 { v30.s }[0], [x21]\n"
@@ -1387,11 +1387,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"72:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index ff521fb2ca..f727efea80 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index e42ceffb50..5ab61fad4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -106,7 +106,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mov x23, #0x10\n" // cntb _, ALL, #1
"mul x22, x22, x26\n" // offset *= kernel_stride * output_size
- "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x16, x8, x24, LSL #2\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
@@ -118,9 +118,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x11, x13, x6\n"
"add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x10, x12, x24, LSL #2\n"
"add x9, x11, x6\n"
"add x28, x17, x21, LSL #2\n"
@@ -128,7 +128,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
@@ -150,179 +150,179 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
"add x23, x23, #0x10\n"
"add x8, x8, #0x10\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x8]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "ld1 { v14.4s }, [x12]\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v20.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
"add x16, x16, #0x10\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q17, [x15, #0x0]\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x15, #0x0]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x11]\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
"add x20, x20, #0x10\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
+ "fmla v23.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.4s, v0.4s, v25.4s\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
"add x21, x21, #0x10\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x14, x9]\n"
"ldr q4, [x15, #0x50]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x10]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v22.4s, v2.4s, v20.4s\n"
"ldr q2, [x15, #0x30]\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v29.4s, v7.4s, v18.4s\n"
"ldr q16, [x12, x13]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
+ "fmla v23.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.4s, v3.4s, v16.4s\n"
"ldr q3, [x15, #0x40]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
"ldr q13, [x8, x9]\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
+ "fmla v22.4s, v7.4s, v19.4s\n"
"ld1 { v14.4s }, [x16]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v24.4s\n"
"ldr q12, [x8, x11]\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
"ldr q16, [x8, x13]\n"
"ldr q5, [x15, #0x60]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x9]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "fmla v28.4s, v8.4s, v20.4s\n"
+ "ldr q17, [x10, x9]\n"
"ldr q6, [x15, #0x70]\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
"ldr q11, [x8, x6]\n"
"ldr q15, [x16, x6]\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
"add x14, x14, #0x10\n"
"ldr q9, [x14, x13]\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
"add x12, x12, #0x10\n"
"add x10, x10, #0x10\n"
- "st1 { v28.4s }, [x17]\n"
+ "st1 { v29.4s }, [x17]\n"
"add x15, x15, #0xa0\n"
- "str q29, [x17, x7]\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v30.4s }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "st1 { v23.4s }, [x28]\n"
+ "str q22, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
"add x8, x8, #0x10\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "ld1 { v14.4s }, [x12]\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v19.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
"add x16, x16, #0x10\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x11]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x10]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.4s, v0.4s, v25.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v22.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
"add x14, x14, #0x10\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v24.4s\n"
"ldr q16, [x12, x13]\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
- "st1 { v28.4s }, [x17]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v22.4s, v6.4s, v17.4s\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.4s, v3.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v22.4s, v7.4s, v20.4s\n"
+ "fmla v21.4s, v7.4s, v18.4s\n"
+ "st1 { v29.4s }, [x17]\n"
"add x12, x12, #0x10\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x9]\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
"add x10, x10, #0x10\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "str q29, [x17, x7]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "st1 { v30.4s }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "st1 { v22.4s }, [x28]\n"
+ "str q21, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 43f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"add x27, x14, x13\n"
"add x26, x8, XZR\n"
@@ -369,17 +369,17 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s15, [x21, #0x0]\n"
"ldr s16, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"add x20, x16, x11\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v3.4s, v14.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
"fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 7f\n"
@@ -558,14 +558,14 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s11, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 41f\n"
"mov x21, x17\n"
"mov x20, x28\n"
@@ -591,7 +591,6 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"42:" // Tile loop: Oddments: Store: Bit 1: End
-
"43:" // Tile loop: End
"ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -606,11 +605,11 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index f65633002e..24fe255dfb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -88,258 +88,258 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "lsr x25, %x[n_channels], #0x2\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x2\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
- "sub x23, XZR, x26\n"
- "cbz x25, 3f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "cmp x26, x25, LSL #4\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
- "add x24, x24, #0xa0\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x28]\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
"ldr q10, [x20, x28]\n"
"ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
"ldr q12, [x20, x28]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x28]\n"
- "ldr q14, [x21, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
"ldp x21, x20, [x13, #0x30]\n"
"ldr q15, [x21, x28]\n"
"ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.4s, v2.4s, v13.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.4s, v3.4s, v14.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q17, [x24, #0x0]\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x20, x28]\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "ldr q0, [x24, #0x10]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v4.4s, v15.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
"ldr q16, [x21, x28]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v19.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.4s, v4.4s, v18.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "ldr q4, [x23, #0x50]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
- "ldr q11, [x20, x28]\n"
- "ldr q1, [x24, #0x20]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr x21, [x13, #0x90]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q15, [x21, x28]\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "ldr q13, [x22, x28]\n"
- "ldr q3, [x24, #0x40]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr x21, [x13, #0xb0]\n"
- "add x23, x23, #0x10\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "ldr q14, [x21, x28]\n"
+ "fmla v24.4s, v6.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v17.4s\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.4s, v3.4s, v17.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v20.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0xb8]\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "ldr q15, [x20, x28]\n"
- "ldr q7, [x24, #0x80]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q8, [x24, #0x90]\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x26]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
"add x28, x28, #0x10\n"
- "ldr q10, [x20, x26]\n"
+ "ldr q10, [x20, x25]\n"
"ldp x21, x20, [x13, #0x10]\n"
- "str q28, [x12, x23]\n"
- "add x24, x24, #0xa0\n"
- "ldr q11, [x21, x26]\n"
- "ldr q12, [x20, x26]\n"
- "str q29, [x11, x23]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x26]\n"
- "str q30, [x10, x23]\n"
- "ldr q14, [x21, x26]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
"ldp x21, x20, [x13, #0x30]\n"
- "str q31, [x9, x23]\n"
- "ldr q15, [x21, x26]\n"
- "ldr q16, [x20, x26]\n"
- "add x26, x26, #0x10\n"
- "cmp x26, x25, LSL #4\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.4s, v2.4s, v13.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.4s, v3.4s, v14.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x20, x28]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.4s, v4.4s, v15.4s\n"
+ "fmla v24.4s, v4.4s, v18.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
"ldr q16, [x21, x28]\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v20.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v0.4s, v23.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x20, x28]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ldr x21, [x13, #0x90]\n"
- "ldr q15, [x21, x28]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "ldr q13, [x22, x28]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr q14, [x21, x28]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v1.4s, v17.4s\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v2.4s, v19.4s\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v18.4s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
"ldr x20, [x13, #0xb8]\n"
- "ldr q15, [x20, x28]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "add x23, x23, #0x10\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.4s, v6.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
"add x28, x28, #0x10\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "str q28, [x12, x23]\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "str q29, [x11, x23]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "str q30, [x10, x23]\n"
- "str q31, [x9, x23]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 42f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "mov x23, x28\n"
- "add x12, x12, x23\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "add x9, x9, x23\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
"ldr x27, [x13, #0x0]\n"
"ldr x26, [x13, #0x8]\n"
"add x27, x27, x28\n"
@@ -385,18 +385,18 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v15.s }[0], [x21], #0x4\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"ldr x20, [x13, #0x40]\n"
"add x20, x20, x28\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v3.4s, v14.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
"fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 6f\n"
@@ -591,14 +591,14 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v11.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (4, 4): Bit 1: End
"fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 40f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -619,11 +619,11 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"42:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index c88a7d57ce..de8a1e4514 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 6ca3976f02..3426fbc3f9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"add x13, x15, x2\n"
"add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x12, x14, x24, LSL #2\n"
"add x11, x13, x2\n"
"add x10, x5, x21, LSL #2\n"
@@ -130,7 +130,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x8, #0x20]\n"
@@ -150,366 +150,366 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1 { v14.4s }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
"add x23, x23, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q16, [x8, #0x140]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
"add x7, x7, #0x10\n"
- "fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
"ldr q1, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"add x4, x4, #0x10\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x8, #0x20]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q17, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
"add x21, x21, #0x10\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
+ "fmla v29.4s, v3.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.4s, v19.4s, v7.4s\n"
"ld1 { v7.4s }, [x7]\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x17, x11]\n"
- "fmla v29.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v31.4s, v19.4s, v8.4s\n"
+ "fmla v29.4s, v19.4s, v14.4s\n"
+ "fmla v28.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.4s, v17.4s, v13.4s\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.4s, v17.4s, v23.4s\n"
"add x17, x17, #0x10\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ld1 { v5.4s }, [x16]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x16, x2]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v0.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.4s, v16.4s, v23.4s\n"
+ "ld1 { v24.4s }, [x16]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v0.4s\n"
+ "fmla v28.4s, v16.4s, v1.4s\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.4s, v20.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.4s, v20.4s, v1.4s\n"
+ "fmla v28.4s, v20.4s, v26.4s\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.4s, v19.4s, v6.4s\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v23.4s\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v23.4s\n"
+ "fmla v28.4s, v18.4s, v22.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ld1 { v9.4s }, [x14]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v17.4s, v22.4s\n"
+ "fmla v28.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.4s, v16.4s, v0.4s\n"
+ "ld1 { v0.4s }, [x14]\n"
+ "fmla v31.4s, v16.4s, v1.4s\n"
+ "fmla v29.4s, v16.4s, v20.4s\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v1.4s\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.4s, v11.4s, v24.4s\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.4s, v11.4s, v23.4s\n"
+ "fmla v29.4s, v11.4s, v0.4s\n"
+ "fmla v28.4s, v11.4s, v4.4s\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.4s, v18.4s, v22.4s\n"
+ "fmla v29.4s, v18.4s, v4.4s\n"
+ "fmla v28.4s, v18.4s, v6.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.4s, v17.4s, v20.4s\n"
"add x14, x14, #0x10\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v17.4s, v6.4s\n"
+ "fmla v28.4s, v17.4s, v26.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.4s, v16.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v31.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v26.4s\n"
+ "fmla v28.4s, v16.4s, v12.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.4s, v13.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.4s, v13.4s, v5.4s\n"
"ld1 { v14.4s }, [x17]\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x130]\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v29.4s, v13.4s, v12.4s\n"
+ "fmla v28.4s, v13.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.4s, v24.4s, v0.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.4s, v24.4s, v4.4s\n"
+ "fmla v29.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.4s, v24.4s, v17.4s\n"
"ldr q0, [x8, #0x150]\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v23.4s, v4.4s\n"
"ldr q13, [x7, x6]\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v23.4s, v6.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
"ldr q1, [x8, #0x160]\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v21.4s, v6.4s\n"
"ld1 { v5.4s }, [x4]\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
"ldr q2, [x8, #0x170]\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v20.4s, v26.4s\n"
"ldr q6, [x4, x2]\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v31.4s, v20.4s, v12.4s\n"
"add x12, x12, #0x10\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
"ldr q11, [x4, x15]\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
"ldr q3, [x8, #0x180]\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v12.4s\n"
"ldr q8, [x7, x2]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
"ldr q10, [x7, x11]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
"ldr q12, [x4, x13]\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
"ldr q9, [x4, x6]\n"
"ldr q4, [x8, #0x190]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"add x8, x8, #0x1a0\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x5]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x5, x3]\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
- "st1 { v30.4s }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "st1 { v29.4s }, [x10]\n"
+ "str q28, [x10, x3]\n"
"add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x8, #0x0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
"add x7, x7, #0x10\n"
"fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x8, #0x10]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "add x4, x4, #0x10\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x8, #0x20]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x17, x11]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "add x17, x17, #0x10\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q18, [x8, #0x10]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ld1 { v5.4s }, [x16]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x16, x2]\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ld1 { v25.4s }, [x16]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ld1 { v9.4s }, [x14]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ld1 { v7.4s }, [x14]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
"add x14, x14, #0x10\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x130]\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.4s, v2.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v5.4s, v2.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.4s, v14.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.4s, v14.4s, v16.4s\n"
+ "fmla v30.4s, v14.4s, v13.4s\n"
+ "fmla v29.4s, v14.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
"add x8, x8, #0x140\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"add x12, x12, #0x10\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x5]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x5, x3]\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "st1 { v31.4s }, [x5]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v30.4s }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "str q29, [x10, x3]\n"
"add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 61f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
@@ -561,11 +561,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s10, [x21, #0x0]\n"
"ldr s14, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
"add x20, x7, x15\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
"fmla v30.4s, v1.4s, v8.4s\n"
@@ -934,14 +934,14 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s9, [x20, #0x0]\n"
"58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 59f\n"
"mov x21, x5\n"
"mov x20, x10\n"
@@ -967,7 +967,6 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"60:" // Tile loop: Oddments: Store: Bit 1: End
-
"61:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -982,7 +981,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 860adac326..32939eb6dc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -103,16 +103,16 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x9, %x[n_channels], #0x2\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x14, x13, [x21, #0x0]\n"
"ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
"sub x28, XZR, x17\n"
"cbz x9, 3f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -120,436 +120,436 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldr q5, [x27, x10]\n"
- "ldr q6, [x26, x10]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x10]\n"
- "ldr q8, [x24, x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q9, [x23, x10]\n"
- "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
"ldp x21, x20, [x15, #0x30]\n"
"ldr q11, [x21, x10]\n"
"ldr q12, [x20, x10]\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x10]\n"
- "ldr q14, [x26, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr q16, [x16, #0x140]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr q6, [x24, x10]\n"
- "fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x23, [x15, #0xa0]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x16, #0x80]\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q17, [x20, x10]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x27, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q5, [x20, x10]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x16, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x23, x10]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.4s, v4.4s, v17.4s\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.4s, v23.4s, v7.4s\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.4s, v23.4s, v14.4s\n"
+ "fmla v29.4s, v23.4s, v5.4s\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.4s, v21.4s, v8.4s\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.4s, v21.4s, v13.4s\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v13.4s\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.4s, v16.4s, v24.4s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.4s, v20.4s, v2.4s\n"
+ "fmla v29.4s, v20.4s, v8.4s\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.4s, v18.4s, v10.4s\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.4s, v18.4s, v8.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.4s, v1.4s, v14.4s\n"
+ "ldr q0, [x20, x10]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x130]\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v29.4s, v1.4s, v22.4s\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.4s, v17.4s, v5.4s\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.4s, v17.4s, v22.4s\n"
+ "fmla v29.4s, v17.4s, v21.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v16.4s, v19.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.4s, v23.4s, v1.4s\n"
+ "fmla v29.4s, v23.4s, v19.4s\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v8.4s\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.4s, v20.4s, v25.4s\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.4s, v20.4s, v19.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.4s, v6.4s, v22.4s\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v2.4s\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.4s, v18.4s, v21.4s\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.4s, v17.4s, v21.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v17.4s, v5.4s\n"
+ "fmla v29.4s, v17.4s, v25.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.4s, v13.4s, v1.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.4s, v13.4s, v19.4s\n"
+ "fmla v28.4s, v13.4s, v25.4s\n"
+ "fmla v29.4s, v13.4s, v10.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.4s, v9.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.4s, v9.4s, v0.4s\n"
+ "fmla v28.4s, v9.4s, v10.4s\n"
+ "fmla v29.4s, v9.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.4s, v24.4s, v16.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v24.4s, v2.4s\n"
+ "fmla v28.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.4s, v24.4s, v17.4s\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "fmla v31.4s, v23.4s, v5.4s\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "ldr q5, [x27, x17]\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v21.4s, v5.4s\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "fmla v28.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "ldr q6, [x26, x17]\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x17]\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v20.4s, v25.4s\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "ldr q8, [x24, x17]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q13, [x22, x17]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "ldr q9, [x23, x17]\n"
+ "fmla v30.4s, v19.4s, v10.4s\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x21, x17]\n"
"ldr q4, [x16, #0x190]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"ldr q11, [x21, x17]\n"
"ldr q12, [x20, x17]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x17]\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "ldr q14, [x26, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "ldr q14, [x20, x17]\n"
"add x17, x17, #0x10\n"
"cmp x17, x9, LSL #4\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
"add x10, x10, #0x10\n"
- "str q28, [x14, x28]\n"
+ "str q30, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q29, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x23, [x15, #0x60]\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "ldr x21, [x15, #0x60]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x25, [x15, #0x90]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x24, [x15, #0x98]\n"
- "ldr x23, [x15, #0xa0]\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x16, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x27, [x15, #0x100]\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
"ldr q3, [x16, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x130]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.4s, v3.4s, v20.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.4s, v3.4s, v19.4s\n"
+ "fmla v30.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.4s, v11.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.4s, v11.4s, v16.4s\n"
+ "fmla v30.4s, v11.4s, v13.4s\n"
+ "fmla v29.4s, v11.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"add x10, x10, #0x10\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x14, x28]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x13, x28]\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x13, x28]\n"
"str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q29, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 60f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x28, x10\n"
- "add x14, x14, x28\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x28\n"
+ "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
"add x9, x9, x10\n"
@@ -606,12 +606,12 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v10.s }[0], [x21], #0x4\n"
"ld1 { v14.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
"ldr x20, [x15, #0x50]\n"
"add x20, x20, x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
"fmla v30.4s, v1.4s, v8.4s\n"
@@ -1005,14 +1005,14 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 58f\n"
"st1 { v28.d }[0], [x14], #0x8\n"
"st1 { v29.d }[0], [x13], #0x8\n"
@@ -1030,12 +1030,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"st1 { v30.s }[0], [x12], #0x4\n"
"st1 { v31.s }[0], [x11], #0x4\n"
"59:" // Oddments: Store: Bit 1: End
-
"60:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index 6fa02b781e..8a8060770c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -48,4 +48,4 @@ class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKer
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 0ea3a8fbed..a2f577784f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -44,70 +45,70 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v2.4s }, [%x[minmax_vals]]\n"
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[minmax_vals], #0x4\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 5f\n"
+ "cbz x9, 5f\n"
"1:" // Channel loop
"movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr q14, [x10, x11]\n"
- "ldr q15, [x9, x11]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr q16, [x28, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x26, x11]\n"
- "ldr q19, [x25, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr q20, [x24, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
"fmla v23.4s, v14.4s, v0.4s\n"
- "ldr q14, [x10, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v24.4s, v15.4s, v0.4s\n"
"fmla v25.4s, v16.4s, v0.4s\n"
- "ldr q15, [x9, x11]\n"
- "ldr q16, [x28, x11]\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
"fmla v26.4s, v17.4s, v0.4s\n"
"fmla v27.4s, v18.4s, v0.4s\n"
- "ldr q17, [x27, x11]\n"
- "ldr q18, [x26, x11]\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
"fmla v28.4s, v19.4s, v0.4s\n"
"fmla v29.4s, v20.4s, v0.4s\n"
- "ldr q19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v30.4s, v21.4s, v0.4s\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x24, x11]\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.4s, v14.4s, v0.4s\n"
@@ -152,7 +153,7 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"str q30, [x21, x11]\n"
"str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x12, LSL #4\n"
+ "cmp x11, x9, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -170,121 +171,121 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"7:" // Oddments: Load bias: Bit 1: End
"8:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"b 10f\n"
"9:" // Oddments: Load: Bit 1: Unset
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"10:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"ble 14f\n"
"11:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v23.4s, v14.4s, v0.4s\n"
"fmla v24.4s, v15.4s, v0.4s\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"fmla v25.4s, v16.4s, v0.4s\n"
"fmla v26.4s, v17.4s, v0.4s\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr x21, [x10], #0x8\n"
"fmla v27.4s, v18.4s, v0.4s\n"
"fmla v28.4s, v19.4s, v0.4s\n"
- "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
"fmla v29.4s, v20.4s, v0.4s\n"
"fmla v30.4s, v21.4s, v0.4s\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"b 13f\n"
"12:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"13:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"bgt 11b\n"
@@ -365,10 +366,11 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"17:" // End
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 2ec0525226..6c07fa645c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) \ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 69b3865a65..9cafd23fb8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v24.4s }, [%x[clamps]]\n"
+ "ld1r { v27.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
"lsr x22, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
@@ -49,7 +50,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr q1, [x21, #0x10]\n"
"mov x21, #0x0\n"
"mov x14, #0x0\n"
- "ld1r { v23.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
@@ -101,7 +102,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v31.4s, v8.s[0]\n"
"fmla v21.4s, v31.4s, v8.s[2]\n"
"fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
"fmla v14.4s, v30.4s, v0.s[1]\n"
"fmla v15.4s, v30.4s, v0.s[3]\n"
"fmla v16.4s, v30.4s, v1.s[1]\n"
@@ -111,7 +112,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v30.4s, v8.s[1]\n"
"fmla v21.4s, v30.4s, v8.s[3]\n"
"fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"fmla v14.4s, v29.4s, v0.s[2]\n"
"fmla v15.4s, v29.4s, v1.s[0]\n"
"fmla v16.4s, v29.4s, v1.s[2]\n"
@@ -121,92 +122,92 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v29.4s, v8.s[2]\n"
"fmla v21.4s, v29.4s, v9.s[0]\n"
"fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x20]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x40]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x50]\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
"ldr q31, [%x[params], #0x70]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
"ldr q30, [%x[params], #0x80]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
"str q14, [x13, x14]\n"
"ldr q14, [%x[params], #0x60]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
"ldr q29, [%x[params], #0x90]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
"add %x[params], %x[params], #0xa0\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
"str q15, [x12, x14]\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
"str q16, [x11, x14]\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
"str q17, [x10, x14]\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"str q18, [x9, x14]\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"str q19, [x28, x14]\n"
"mov v15.16b, v14.16b\n"
"str q20, [x27, x14]\n"
@@ -231,7 +232,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v31.4s, v8.s[0]\n"
"fmla v21.4s, v31.4s, v8.s[2]\n"
"fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
"fmla v14.4s, v30.4s, v0.s[1]\n"
"fmla v15.4s, v30.4s, v0.s[3]\n"
"fmla v16.4s, v30.4s, v1.s[1]\n"
@@ -241,7 +242,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v30.4s, v8.s[1]\n"
"fmla v21.4s, v30.4s, v8.s[3]\n"
"fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"fmla v14.4s, v29.4s, v0.s[2]\n"
"fmla v15.4s, v29.4s, v1.s[0]\n"
"fmla v16.4s, v29.4s, v1.s[2]\n"
@@ -251,87 +252,87 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v29.4s, v8.s[2]\n"
"fmla v21.4s, v29.4s, v9.s[0]\n"
"fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x20]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x40]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x50]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x60\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
"str q14, [x13, x14]\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
"str q15, [x12, x14]\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
"str q16, [x11, x14]\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"str q17, [x10, x14]\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"str q18, [x9, x14]\n"
"str q19, [x28, x14]\n"
"str q20, [x27, x14]\n"
@@ -342,123 +343,123 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q14, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x10]\n"
+ "ldr q25, [%x[params], #0x10]\n"
"mov v15.16b, v14.16b\n"
"mov v16.16b, v14.16b\n"
- "ldr q30, [%x[params], #0x20]\n"
- "ldr q29, [%x[params], #0x30]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "ldr q23, [%x[params], #0x30]\n"
"mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
"mov v20.16b, v14.16b\n"
- "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v25.4s, v0.s[2]\n"
"mov v21.16b, v14.16b\n"
"mov v22.16b, v14.16b\n"
- "fmla v14.4s, v31.4s, v0.s[0]\n"
- "fmla v16.4s, v31.4s, v1.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[0]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v5.s[0]\n"
- "fmla v20.4s, v31.4s, v8.s[0]\n"
- "fmla v21.4s, v31.4s, v8.s[2]\n"
- "fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x40]\n"
- "fmla v14.4s, v30.4s, v0.s[1]\n"
- "fmla v15.4s, v30.4s, v0.s[3]\n"
- "fmla v16.4s, v30.4s, v1.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[1]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[1]\n"
- "fmla v20.4s, v30.4s, v8.s[1]\n"
- "fmla v21.4s, v30.4s, v8.s[3]\n"
- "fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x50]\n"
- "fmla v14.4s, v29.4s, v0.s[2]\n"
- "fmla v15.4s, v29.4s, v1.s[0]\n"
- "fmla v16.4s, v29.4s, v1.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[2]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[2]\n"
- "fmla v20.4s, v29.4s, v8.s[2]\n"
- "fmla v21.4s, v29.4s, v9.s[0]\n"
- "fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x60]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x70]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x80]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x90]\n"
+ "fmla v14.4s, v25.4s, v0.s[0]\n"
+ "fmla v16.4s, v25.4s, v1.s[0]\n"
+ "fmla v17.4s, v25.4s, v4.s[0]\n"
+ "fmla v18.4s, v25.4s, v4.s[2]\n"
+ "fmla v19.4s, v25.4s, v5.s[0]\n"
+ "fmla v20.4s, v25.4s, v8.s[0]\n"
+ "fmla v21.4s, v25.4s, v8.s[2]\n"
+ "fmla v22.4s, v25.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x40]\n"
+ "fmla v14.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[3]\n"
+ "fmla v19.4s, v24.4s, v5.s[1]\n"
+ "fmla v20.4s, v24.4s, v8.s[1]\n"
+ "fmla v21.4s, v24.4s, v8.s[3]\n"
+ "fmla v22.4s, v24.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v14.4s, v23.4s, v0.s[2]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v1.s[2]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v5.s[0]\n"
+ "fmla v19.4s, v23.4s, v5.s[2]\n"
+ "fmla v20.4s, v23.4s, v8.s[2]\n"
+ "fmla v21.4s, v23.4s, v9.s[0]\n"
+ "fmla v22.4s, v23.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x90]\n"
"add %x[params], %x[params], #0xa0\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
"add x20, x13, x14\n"
"add x22, x12, x14\n"
@@ -519,15 +520,14 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"st1 { v21.s }[0], [x21]\n"
"st1 { v22.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
-
"6:" // End
-
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index 5ae8dd3653..9f514c78e7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 50848cc2e8..c9bb1f41da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,7 +42,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v21.4s }, [%x[clamps]]\n"
+ "ld1r { v26.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
"lsr x22, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
@@ -50,7 +50,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ldr q1, [x21, #0x10]\n"
"mov x21, #0x0\n"
"mov x13, #0x0\n"
- "ld1r { v20.4s }, [x20]\n"
+ "ld1r { v25.4s }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
@@ -98,7 +98,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v31.4s, v2.s[1]\n"
"fmla v18.4s, v31.4s, v2.s[2]\n"
"fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x0]\n"
"fmla v12.4s, v30.4s, v0.s[1]\n"
"fmla v13.4s, v30.4s, v0.s[2]\n"
"fmla v14.4s, v30.4s, v0.s[3]\n"
@@ -107,7 +107,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v30.4s, v2.s[2]\n"
"fmla v18.4s, v30.4s, v2.s[3]\n"
"fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q23, [%x[params], #0x10]\n"
"fmla v12.4s, v29.4s, v0.s[2]\n"
"fmla v13.4s, v29.4s, v0.s[3]\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
@@ -116,7 +116,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v29.4s, v2.s[3]\n"
"fmla v18.4s, v29.4s, v3.s[0]\n"
"fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x20]\n"
"fmla v12.4s, v28.4s, v0.s[3]\n"
"fmla v13.4s, v28.4s, v1.s[0]\n"
"fmla v14.4s, v28.4s, v1.s[1]\n"
@@ -125,7 +125,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v28.4s, v3.s[0]\n"
"fmla v18.4s, v28.4s, v3.s[1]\n"
"fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x30]\n"
+ "ldr q21, [%x[params], #0x30]\n"
"fmla v12.4s, v27.4s, v1.s[0]\n"
"fmla v13.4s, v27.4s, v1.s[1]\n"
"fmla v14.4s, v27.4s, v1.s[2]\n"
@@ -134,209 +134,209 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v27.4s, v3.s[1]\n"
"fmla v18.4s, v27.4s, v3.s[2]\n"
"fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0x40]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0x50]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0x60]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0x70]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0x80]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0x90]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0xa0]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0xb0]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0xc0]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0xd0]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0xe0]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0xf0]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x100]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x110]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x120]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x130]\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
"ldr q31, [%x[params], #0x150]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
"ldr q30, [%x[params], #0x160]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
"ldr q29, [%x[params], #0x170]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
"ldr q28, [%x[params], #0x180]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
"str q12, [x12, x13]\n"
"ldr q12, [%x[params], #0x140]\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
"ldr q27, [%x[params], #0x190]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
"add %x[params], %x[params], #0x1a0\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
"str q13, [x11, x13]\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
"str q14, [x10, x13]\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
"str q15, [x9, x13]\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"str q16, [x28, x13]\n"
"str q17, [x27, x13]\n"
"mov v13.16b, v12.16b\n"
@@ -359,7 +359,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v31.4s, v2.s[1]\n"
"fmla v18.4s, v31.4s, v2.s[2]\n"
"fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x0]\n"
"fmla v12.4s, v30.4s, v0.s[1]\n"
"fmla v13.4s, v30.4s, v0.s[2]\n"
"fmla v14.4s, v30.4s, v0.s[3]\n"
@@ -368,7 +368,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v30.4s, v2.s[2]\n"
"fmla v18.4s, v30.4s, v2.s[3]\n"
"fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q23, [%x[params], #0x10]\n"
"fmla v12.4s, v29.4s, v0.s[2]\n"
"fmla v13.4s, v29.4s, v0.s[3]\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
@@ -377,7 +377,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v29.4s, v2.s[3]\n"
"fmla v18.4s, v29.4s, v3.s[0]\n"
"fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x20]\n"
"fmla v12.4s, v28.4s, v0.s[3]\n"
"fmla v13.4s, v28.4s, v1.s[0]\n"
"fmla v14.4s, v28.4s, v1.s[1]\n"
@@ -386,7 +386,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v28.4s, v3.s[0]\n"
"fmla v18.4s, v28.4s, v3.s[1]\n"
"fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x30]\n"
+ "ldr q21, [%x[params], #0x30]\n"
"fmla v12.4s, v27.4s, v1.s[0]\n"
"fmla v13.4s, v27.4s, v1.s[1]\n"
"fmla v14.4s, v27.4s, v1.s[2]\n"
@@ -395,202 +395,202 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v27.4s, v3.s[1]\n"
"fmla v18.4s, v27.4s, v3.s[2]\n"
"fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0x40]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0x50]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0x60]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0x70]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0x80]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0x90]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0xa0]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0xb0]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0xc0]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0xd0]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0xe0]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0xf0]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x100]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x110]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x120]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x130]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
"add %x[params], %x[params], #0x140\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
"str q12, [x12, x13]\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
"str q13, [x11, x13]\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
"str q14, [x10, x13]\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"str q15, [x9, x13]\n"
"str q16, [x28, x13]\n"
"str q17, [x27, x13]\n"
@@ -601,255 +601,255 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q12, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"mov v13.16b, v12.16b\n"
"mov v14.16b, v12.16b\n"
- "ldr q30, [%x[params], #0x20]\n"
- "ldr q29, [%x[params], #0x30]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
"mov v15.16b, v12.16b\n"
"mov v16.16b, v12.16b\n"
- "ldr q28, [%x[params], #0x40]\n"
- "ldr q27, [%x[params], #0x50]\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
"mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v12.16b\n"
- "fmla v12.4s, v31.4s, v0.s[0]\n"
- "fmla v13.4s, v31.4s, v0.s[1]\n"
- "fmla v14.4s, v31.4s, v0.s[2]\n"
- "fmla v15.4s, v31.4s, v0.s[3]\n"
- "fmla v16.4s, v31.4s, v2.s[0]\n"
- "fmla v17.4s, v31.4s, v2.s[1]\n"
- "fmla v18.4s, v31.4s, v2.s[2]\n"
- "fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x60]\n"
- "fmla v12.4s, v30.4s, v0.s[1]\n"
- "fmla v13.4s, v30.4s, v0.s[2]\n"
- "fmla v14.4s, v30.4s, v0.s[3]\n"
- "fmla v15.4s, v30.4s, v1.s[0]\n"
- "fmla v16.4s, v30.4s, v2.s[1]\n"
- "fmla v17.4s, v30.4s, v2.s[2]\n"
- "fmla v18.4s, v30.4s, v2.s[3]\n"
- "fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x70]\n"
- "fmla v12.4s, v29.4s, v0.s[2]\n"
- "fmla v13.4s, v29.4s, v0.s[3]\n"
- "fmla v14.4s, v29.4s, v1.s[0]\n"
- "fmla v15.4s, v29.4s, v1.s[1]\n"
- "fmla v16.4s, v29.4s, v2.s[2]\n"
- "fmla v17.4s, v29.4s, v2.s[3]\n"
- "fmla v18.4s, v29.4s, v3.s[0]\n"
- "fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x80]\n"
- "fmla v12.4s, v28.4s, v0.s[3]\n"
- "fmla v13.4s, v28.4s, v1.s[0]\n"
- "fmla v14.4s, v28.4s, v1.s[1]\n"
- "fmla v15.4s, v28.4s, v1.s[2]\n"
- "fmla v16.4s, v28.4s, v2.s[3]\n"
- "fmla v17.4s, v28.4s, v3.s[0]\n"
- "fmla v18.4s, v28.4s, v3.s[1]\n"
- "fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x90]\n"
- "fmla v12.4s, v27.4s, v1.s[0]\n"
- "fmla v13.4s, v27.4s, v1.s[1]\n"
- "fmla v14.4s, v27.4s, v1.s[2]\n"
- "fmla v15.4s, v27.4s, v1.s[3]\n"
- "fmla v16.4s, v27.4s, v3.s[0]\n"
- "fmla v17.4s, v27.4s, v3.s[1]\n"
- "fmla v18.4s, v27.4s, v3.s[2]\n"
- "fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0xa0]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0xc0]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0xd0]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0xe0]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0xf0]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0x100]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0x110]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0x120]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0x130]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0x140]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0x150]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x160]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x170]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x180]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x190]\n"
+ "fmla v12.4s, v24.4s, v0.s[0]\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
+ "fmla v14.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v2.s[0]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v18.4s, v24.4s, v2.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
+ "fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v0.s[3]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v2.s[1]\n"
+ "fmla v17.4s, v23.4s, v2.s[2]\n"
+ "fmla v18.4s, v23.4s, v2.s[3]\n"
+ "fmla v19.4s, v23.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
+ "fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v14.4s, v22.4s, v1.s[0]\n"
+ "fmla v15.4s, v22.4s, v1.s[1]\n"
+ "fmla v16.4s, v22.4s, v2.s[2]\n"
+ "fmla v17.4s, v22.4s, v2.s[3]\n"
+ "fmla v18.4s, v22.4s, v3.s[0]\n"
+ "fmla v19.4s, v22.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x80]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v15.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "fmla v17.4s, v21.4s, v3.s[0]\n"
+ "fmla v18.4s, v21.4s, v3.s[1]\n"
+ "fmla v19.4s, v21.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x90]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v14.4s, v20.4s, v1.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v16.4s, v20.4s, v3.s[0]\n"
+ "fmla v17.4s, v20.4s, v3.s[1]\n"
+ "fmla v18.4s, v20.4s, v3.s[2]\n"
+ "fmla v19.4s, v20.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0x100]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0x110]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0x120]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0x130]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0x140]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x190]\n"
"add %x[params], %x[params], #0x1a0\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
"add x20, x12, x13\n"
"add x21, x11, x13\n"
@@ -904,15 +904,14 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"st1 { v18.s }[0], [x21]\n"
"st1 { v19.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
-
"6:" // End
-
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index d60e15ec84..3bece73973 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index c28f29c4f9..cc18dd4bb4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -43,10 +44,10 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v11.4s }, [%x[minmax_vals]]\n"
+ "ld1r { v12.4s }, [%x[minmax_vals]]\n"
"lsr x11, %x[n_output_channels], #0x2\n"
"add x20, %x[minmax_vals], #0x4\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"mov x10, #0x0\n"
"cbz x11, 8f\n"
"1:" // Output channel loop
@@ -55,16 +56,16 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"lsl x20, x10, #0x2\n"
"ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "ldr q9, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
@@ -80,368 +81,368 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 6f\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 6f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q2, [x23, #0x10]\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q0, [x9, #0x10]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x23, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q7, [x21, #0x10]\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x20, #0x0]\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "ldr q3, [x23, #0x0]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x10]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [x9, #0x0]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q4, [%x[weights], #0x10]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "lsl x28, x10, #0x2\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
"lsl x28, x10, #0x2\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "add %x[weights], %x[weights], #0x10\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
+ "add %x[weights], %x[weights], #0x10\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v16.4s, v1.4s, v4.s[0]\n"
+ "fmla v17.4s, v1.4s, v4.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v4.s[2]\n"
+ "fmla v19.4s, v1.4s, v4.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v1.4s, v3.s[0]\n"
+ "fmla v21.4s, v1.4s, v3.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v1.4s, v3.s[2]\n"
+ "fmla v23.4s, v1.4s, v3.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v2.s[0]\n"
+ "fmla v25.4s, v1.4s, v2.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v2.s[2]\n"
+ "fmla v27.4s, v1.4s, v2.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v0.s[2]\n"
+ "fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"lsl x28, x10, #0x2\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x4\n"
"cmp x10, x11, LSL #2\n"
@@ -461,16 +462,16 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ld1 { v31.s }[0], [x20]\n"
"10:" // Output channel oddments: Load bias: Bit 1: End
"11:" // Output channel oddments: Load bias: Done
- "ldr q9, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
@@ -486,66 +487,82 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 15f\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 15f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q2, [x23, #0x10]\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q0, [x9, #0x10]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
"beq 13f\n"
"12:" // Output channel oddments: Kernel loop
- "ldp x23, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q7, [x21, #0x10]\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x20, #0x0]\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "ldr q3, [x23, #0x0]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x10]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [x9, #0x0]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q4, [%x[weights], #0x10]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 12b\n"
"13:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
@@ -562,65 +579,33 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
"b 16f\n"
"14:" // Output channel oddments: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
@@ -635,216 +620,231 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v0.4s, v4.s[0]\n"
+ "fmla v17.4s, v0.4s, v4.s[1]\n"
+ "fmla v18.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v20.4s, v0.4s, v3.s[0]\n"
+ "fmla v21.4s, v0.4s, v3.s[1]\n"
+ "fmla v22.4s, v0.4s, v3.s[2]\n"
+ "fmla v23.4s, v0.4s, v3.s[3]\n"
+ "fmla v24.4s, v0.4s, v2.s[0]\n"
+ "fmla v25.4s, v0.4s, v2.s[1]\n"
+ "fmla v26.4s, v0.4s, v2.s[2]\n"
+ "fmla v27.4s, v0.4s, v2.s[3]\n"
+ "fmla v28.4s, v0.4s, v1.s[0]\n"
+ "fmla v29.4s, v0.4s, v1.s[1]\n"
+ "fmla v30.4s, v0.4s, v1.s[2]\n"
+ "fmla v31.4s, v0.4s, v1.s[3]\n"
"b 16f\n"
"15:" // Output channel oddments: Single kernel point
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"16:" // Output channel oddments: Done
- "fmin v16.4s, v16.4s, v10.4s\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
"tbz %x[n_output_channels], #1, 17f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.d }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.d }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.d }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.d }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.d }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.d }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.d }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.d }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
"add x10, x10, #0x2\n"
- "st1 { v24.d }[0], [x20]\n"
- "st1 { v25.d }[0], [x21]\n"
- "st1 { v26.d }[0], [x22]\n"
- "st1 { v27.d }[0], [x23]\n"
- "st1 { v28.d }[0], [x24]\n"
- "st1 { v29.d }[0], [x25]\n"
- "st1 { v30.d }[0], [x26]\n"
- "st1 { v31.d }[0], [x27]\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 18f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.s }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.s }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.s }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.s }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.s }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.s }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.s }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.s }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x26]\n"
- "st1 { v31.s }[2], [x27]\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 18f\n"
"17:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.s }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.s }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.s }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.s }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.s }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.s }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.s }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.s }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v24.s }[0], [x20]\n"
- "st1 { v25.s }[0], [x21]\n"
- "st1 { v26.s }[0], [x22]\n"
- "st1 { v27.s }[0], [x23]\n"
- "st1 { v28.s }[0], [x24]\n"
- "st1 { v29.s }[0], [x25]\n"
- "st1 { v30.s }[0], [x26]\n"
- "st1 { v31.s }[0], [x27]\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"18:" // Output channel oddments: Done: Store: Bit 1: End
-
"19:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 79bba40ca3..85053b374c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *,
- const int32_t *,
- const arm_gemm::Requantize32&,
- const int32_t *, const int32_t *,
- int8_t *const *const
-);
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index fda88f94bb..916c8a4afe 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,15 +30,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x20, #0x1\n"
@@ -47,817 +39,817 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"orr x20, x20, #0x10000\n"
"lsr x11, %x[n_channels], #0x4\n"
- "dup v14.4s, w20\n"
+ "dup v12.4s, w20\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v13.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"mov x28, #0x0\n"
"mov x27, #0x0\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x11, 3f\n"
- "ldr q9, [x15, x28]\n"
- "ldr q8, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q7, [x13, x28]\n"
- "ldr q6, [x12, x28]\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q4, [x10, x28]\n"
- "ldr q3, [x9, x28]\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "ldr q2, [x26, x28]\n"
- "ldr q1, [x25, x28]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q15, [x15, x28]\n"
"ldr q28, [x14, x28]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "ldr q23, [x9, x28]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
"add x28, x28, #0x10\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"subs x11, x11, #0x1\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- "ldr q3, [x9, x28]\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "ldr q8, [x14, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- "ldr q2, [x26, x28]\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "ldr q7, [x13, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
"smax v25.4s, v25.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- "ldr q1, [x25, x28]\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "ldr q6, [x12, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [x15, x28]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x170]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x150]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [x10, x28]\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "ldr q23, [x9, x28]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "str s20, [x21, x27]\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
"add x27, x27, #0x4\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x140]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"add x28, x28, #0x10\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
"ldr q4, [%x[params], #0x10]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
"ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str s5, [x24, x27]\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
"add x27, x27, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -869,794 +861,794 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d8, [x14], #0x8\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
"ldr d7, [x13], #0x8\n"
- "ldr d6, [x12], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d1, [x25], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
"ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v6.s }[2], [x12], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.h }[6], [x15], #0x2\n"
- "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
"ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v6.h }[6], [x12], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[14], [x15], #0x1\n"
- "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
"ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v6.b }[14], [x12], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[12], [x15], #0x1\n"
- "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
"ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v6.b }[12], [x12], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.h }[4], [x15], #0x2\n"
- "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
"ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v6.h }[4], [x12], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[10], [x15], #0x1\n"
- "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
"ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v6.b }[10], [x12], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[8], [x15], #0x1\n"
- "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
"ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v6.b }[8], [x12], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s9, [x15], #0x4\n"
- "ldr s8, [x14], #0x4\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
"ldr s7, [x13], #0x4\n"
- "ldr s6, [x12], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v9.h }[2], [x15], #0x2\n"
- "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
"ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v6.h }[2], [x12], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[6], [x15], #0x1\n"
- "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
"ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v6.b }[6], [x12], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[4], [x15], #0x1\n"
- "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
"ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v6.b }[4], [x12], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h9, [x15], #0x2\n"
- "ldr h8, [x14], #0x2\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
"ldr h7, [x13], #0x2\n"
- "ldr h6, [x12], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[2], [x15], #0x1\n"
- "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
"ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v6.b }[2], [x12], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b9, [x15], #0x1\n"
- "ldr b8, [x14], #0x1\n"
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
"ldr b7, [x13], #0x1\n"
- "ldr b6, [x12], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b1, [x25], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
"ldp x13, x12, [%x[inptrs], #0x50]\n"
"add x15, x15, x28\n"
"add x14, x14, x28\n"
"ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x13, x13, x28\n"
"add x12, x12, x28\n"
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d29, [x15], #0x8\n"
- "ldr d28, [x14], #0x8\n"
- "ldr d27, [x13], #0x8\n"
- "ldr d26, [x12], #0x8\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v29.s }[2], [x15], #0x4\n"
- "ld1 { v28.s }[2], [x14], #0x4\n"
- "ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v26.s }[2], [x12], #0x4\n"
- "ld1 { v24.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v29.h }[6], [x15], #0x2\n"
- "ld1 { v28.h }[6], [x14], #0x2\n"
- "ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v26.h }[6], [x12], #0x2\n"
- "ld1 { v24.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v22.h }[6], [x26], #0x2\n"
- "ld1 { v21.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[14], [x15], #0x1\n"
- "ld1 { v28.b }[14], [x14], #0x1\n"
- "ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v26.b }[14], [x12], #0x1\n"
- "ld1 { v24.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v22.b }[14], [x26], #0x1\n"
- "ld1 { v21.b }[14], [x25], #0x1\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[12], [x15], #0x1\n"
- "ld1 { v28.b }[12], [x14], #0x1\n"
- "ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v26.b }[12], [x12], #0x1\n"
- "ld1 { v24.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v22.b }[12], [x26], #0x1\n"
- "ld1 { v21.b }[12], [x25], #0x1\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v29.h }[4], [x15], #0x2\n"
- "ld1 { v28.h }[4], [x14], #0x2\n"
- "ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v26.h }[4], [x12], #0x2\n"
- "ld1 { v24.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v22.h }[4], [x26], #0x2\n"
- "ld1 { v21.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[10], [x15], #0x1\n"
- "ld1 { v28.b }[10], [x14], #0x1\n"
- "ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v26.b }[10], [x12], #0x1\n"
- "ld1 { v24.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v22.b }[10], [x26], #0x1\n"
- "ld1 { v21.b }[10], [x25], #0x1\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[8], [x15], #0x1\n"
- "ld1 { v28.b }[8], [x14], #0x1\n"
- "ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v26.b }[8], [x12], #0x1\n"
- "ld1 { v24.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v22.b }[8], [x26], #0x1\n"
- "ld1 { v21.b }[8], [x25], #0x1\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s29, [x15], #0x4\n"
- "ldr s28, [x14], #0x4\n"
- "ldr s27, [x13], #0x4\n"
- "ldr s26, [x12], #0x4\n"
- "ldr s24, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v29.h }[2], [x15], #0x2\n"
- "ld1 { v28.h }[2], [x14], #0x2\n"
- "ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v24.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v22.h }[2], [x26], #0x2\n"
- "ld1 { v21.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[6], [x15], #0x1\n"
- "ld1 { v28.b }[6], [x14], #0x1\n"
- "ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v26.b }[6], [x12], #0x1\n"
- "ld1 { v24.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v22.b }[6], [x26], #0x1\n"
- "ld1 { v21.b }[6], [x25], #0x1\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[4], [x15], #0x1\n"
- "ld1 { v28.b }[4], [x14], #0x1\n"
- "ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v26.b }[4], [x12], #0x1\n"
- "ld1 { v24.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v22.b }[4], [x26], #0x1\n"
- "ld1 { v21.b }[4], [x25], #0x1\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h29, [x15], #0x2\n"
- "ldr h28, [x14], #0x2\n"
- "ldr h27, [x13], #0x2\n"
- "ldr h26, [x12], #0x2\n"
- "ldr h24, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "ldr h21, [x25], #0x2\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[2], [x15], #0x1\n"
- "ld1 { v28.b }[2], [x14], #0x1\n"
- "ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v26.b }[2], [x12], #0x1\n"
- "ld1 { v24.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v22.b }[2], [x26], #0x1\n"
- "ld1 { v21.b }[2], [x25], #0x1\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b29, [x15], #0x1\n"
- "ldr b28, [x14], #0x1\n"
- "ldr b27, [x13], #0x1\n"
- "ldr b26, [x12], #0x1\n"
- "ldr b24, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "ldr b21, [x25], #0x1\n"
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
"cmp x20, #0x4\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x50]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
+ ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
"add %x[params], %x[params], #0x60\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 20f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 21f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
+ ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 24f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 25f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- "movi v17.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 28f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 29f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
+ ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 33f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 411b4788d8..a679b02f7c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index d69d0e1ef2..a181603f1e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 852466c48d..7370f89699 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index fa9ae97dee..6432417c35 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index e60597d390..65ebe627ef 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 4b0ad00187..1dc0f33186 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
index 9b1f7c239f..9c92a9dd46 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKern
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 3f345cf95a..77b7d231e0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ssubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ssubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 5ca3ccd4bf..14adf8880f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 342a297dd4..be8fbfa0e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -40,169 +41,169 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q14, [%x[params], #0x0]\n"
+ "ldr q11, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v15.16b, #0x1\n"
- "ushr v15.4s, v15.4s, #0x8\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ld1 { v1.16b }, [x20]\n"
- "mov v29.16b, v1.16b\n"
- "mov v16.16b, v1.16b\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1 { v2.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v22.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x20]\n"
"ld1 { v4.16b }, [x20]\n"
- "mov v31.16b, v2.16b\n"
- "mov v30.16b, v2.16b\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1 { v0.16b }, [x20]\n"
- "mov v23.16b, v4.16b\n"
- "mov v21.16b, v4.16b\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1 { v3.16b }, [x20]\n"
- "mov v20.16b, v4.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x2\n"
- "ext v16.16b, v16.16b, v16.16b, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x2\n"
- "ext v31.16b, v31.16b, v31.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"ld1r { v12.4s }, [x20]\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
- "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v10.4s }, [x20]\n"
- "mov v25.16b, v0.16b\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
"mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
"mov v18.16b, v0.16b\n"
- "mov v24.16b, v3.16b\n"
+ "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"mov v17.16b, v3.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
"ext v19.16b, v19.16b, v19.16b, #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v1.4s, v1.4s, v16.4s\n"
- "mov v16.16b, v3.16b\n"
- "zip1 v29.4s, v29.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v31.4s\n"
- "zip1 v22.4s, v22.4s, v30.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x2\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v21.4s\n"
- "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
"zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v25.4s, v25.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v29.4s\n"
- "zip1 v2.4s, v2.4s, v22.4s\n"
- ".inst 0x4f81e1fa // sdot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
"zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v24.4s, v24.4s, v16.4s\n"
- ".inst 0x4fa1e1fb // sdot v27.4s, v15.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v23.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
"movi v23.4s, #0x0\n"
- ".inst 0x4f81e9f7 // sdot v23.4s, v15.16b, v1.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
"movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
- ".inst 0x4fa1e9f6 // sdot v22.4s, v15.16b, v1.4b[3]\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x4f82e1f5 // sdot v21.4s, v15.16b, v2.4b[0]\n"
- "movi v8.4s, #0x0\n"
+ ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4fa2e1f4 // sdot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
"movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4f82e9e9 // sdot v9.4s, v15.16b, v2.4b[2]\n"
+ ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v25.4s\n"
- ".inst 0x4fa2e9e8 // sdot v8.4s, v15.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v24.4s\n"
- ".inst 0x4f84e1f3 // sdot v19.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x4fa4e1f2 // sdot v18.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4f84e9f1 // sdot v17.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x4fa4e9f0 // sdot v16.4s, v15.16b, v4.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
"movi v30.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f80e1ff // sdot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
"movi v28.4s, #0x0\n"
- ".inst 0x4fa0e1fe // sdot v30.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4f80e9fd // sdot v29.4s, v15.16b, v0.4b[2]\n"
- ".inst 0x4fa0e9fc // sdot v28.4s, v15.16b, v0.4b[3]\n"
- "add v24.4s, v26.4s, v21.4s\n"
- "add v25.4s, v27.4s, v20.4s\n"
- "add v26.4s, v23.4s, v9.4s\n"
- "add v27.4s, v22.4s, v8.4s\n"
- "add v23.4s, v19.4s, v21.4s\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x4f83e1f6 // sdot v22.4s, v15.16b, v3.4b[0]\n"
- "add v21.4s, v18.4s, v20.4s\n"
+ ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
"movi v20.4s, #0x0\n"
- ".inst 0x4fa3e1f4 // sdot v20.4s, v15.16b, v3.4b[1]\n"
- "add v19.4s, v17.4s, v9.4s\n"
+ ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f83e9f2 // sdot v18.4s, v15.16b, v3.4b[2]\n"
- "add v17.4s, v16.4s, v8.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4fa3e9f0 // sdot v16.4s, v15.16b, v3.4b[3]\n"
+ ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
"add v24.4s, v24.4s, v31.4s\n"
"add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v28.4s\n"
- "add v28.4s, v23.4s, v22.4s\n"
- "add v29.4s, v21.4s, v20.4s\n"
- "add v30.4s, v19.4s, v18.4s\n"
- "add v31.4s, v17.4s, v16.4s\n"
- "neg v13.4s, v13.4s\n"
- "mul v24.4s, v24.4s, v13.4s\n"
- "mul v25.4s, v25.4s, v13.4s\n"
- "mul v26.4s, v26.4s, v13.4s\n"
- "mul v27.4s, v27.4s, v13.4s\n"
- "mul v28.4s, v28.4s, v13.4s\n"
- "mul v29.4s, v29.4s, v13.4s\n"
- "mul v30.4s, v30.4s, v13.4s\n"
- "mul v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
- "ldr q14, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x20]\n"
".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -219,43 +220,43 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr q5, [%x[params], #0x30]\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
+ "and v19.16b, v24.16b, v21.16b\n"
".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
- "and v18.16b, v25.16b, v20.16b\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "and v16.16b, v27.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
@@ -264,38 +265,38 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -329,14 +330,14 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x0]\n"
@@ -415,30 +416,30 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -514,4 +515,5 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 0641229aa7..62b033f48d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
-#if defined(__aarch64__)
-
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -57,4 +57,5 @@ struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 9fa38c6efe..17afc92e30 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,133 +43,133 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
__asm__ __volatile__(
"ldr q12, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v28.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v31.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v30.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "mov v16.16b, v3.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v4.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
- "mov v15.16b, v4.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
- "mov v20.16b, v2.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
"ld1 { v1.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v16.2d\n"
- "zip1 v4.2d, v4.2d, v15.2d\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
"ld1 { v5.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x30]\n"
"mov v26.16b, v1.16b\n"
- "mov v13.16b, v5.16b\n"
+ "mov v22.16b, v5.16b\n"
"ld1 { v6.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x38]\n"
"mov v19.16b, v6.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v7.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x0]\n"
- "mov v17.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v20.2d\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
"ld1 { v0.16b }, [x20]\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
"ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4f83e392 // sdot v18.4s, v28.16b, v3.4b[0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4f83eb9f // sdot v31.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f84e398 // sdot v24.4s, v28.16b, v4.4b[0]\n"
+ ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v23.4s }, [x20]\n"
- ".inst 0x4f84eb9e // sdot v30.4s, v28.16b, v4.4b[2]\n"
- "mov v16.16b, v0.16b\n"
- ".inst 0x4f82e395 // sdot v21.4s, v28.16b, v2.4b[0]\n"
- "movi v20.4s, #0x0\n"
- "movi v29.4s, #0x1\n"
- ".inst 0x4f82eb94 // sdot v20.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
"zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x4fa3e3b2 // sdot v18.4s, v29.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v13.2d\n"
+ ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
"zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x4fa3ebbf // sdot v31.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v17.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
"movi v22.4s, #0x0\n"
- ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x4fa4ebbe // sdot v30.4s, v29.16b, v4.4b[3]\n"
- ".inst 0x4f81e396 // sdot v22.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
"ld1r { v15.4s }, [x20]\n"
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x4f81eb9a // sdot v26.4s, v28.16b, v1.4b[2]\n"
- "zip1 v0.2d, v0.2d, v16.2d\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4f85e399 // sdot v25.4s, v28.16b, v5.4b[0]\n"
+ ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4f85eb9b // sdot v27.4s, v28.16b, v5.4b[2]\n"
- ".inst 0x4f86e393 // sdot v19.4s, v28.16b, v6.4b[0]\n"
- "add v24.4s, v18.4s, v24.4s\n"
- "mov x9, #0x0\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f86eb92 // sdot v18.4s, v28.16b, v6.4b[2]\n"
- ".inst 0x4fa2e3b5 // sdot v21.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- ".inst 0x4fa2ebb4 // sdot v20.4s, v29.16b, v2.4b[3]\n"
- "add v17.4s, v31.4s, v30.4s\n"
- ".inst 0x4fa1e3b6 // sdot v22.4s, v29.16b, v1.4b[1]\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f87e390 // sdot v16.4s, v28.16b, v7.4b[0]\n"
- ".inst 0x4fa1ebba // sdot v26.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- ".inst 0x4fa5e3b9 // sdot v25.4s, v29.16b, v5.4b[1]\n"
- ".inst 0x4fa5ebbb // sdot v27.4s, v29.16b, v5.4b[3]\n"
- "add v30.4s, v21.4s, v24.4s\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x4fa6e3b3 // sdot v19.4s, v29.16b, v6.4b[1]\n"
- ".inst 0x4fa6ebb2 // sdot v18.4s, v29.16b, v6.4b[3]\n"
- "add v31.4s, v20.4s, v17.4s\n"
+ ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x4fa7e3b0 // sdot v16.4s, v29.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v30.4s\n"
+ ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- "add v21.4s, v26.4s, v31.4s\n"
- "add v20.4s, v25.4s, v19.4s\n"
- "add v19.4s, v27.4s, v18.4s\n"
- "add v18.4s, v16.4s, v24.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f87eb90 // sdot v16.4s, v28.16b, v7.4b[2]\n"
- ".inst 0x4fa7ebb0 // sdot v16.4s, v29.16b, v7.4b[3]\n"
- "add v17.4s, v16.4s, v17.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4fa0e3b0 // sdot v16.4s, v29.16b, v0.4b[1]\n"
- "add v24.4s, v22.4s, v16.4s\n"
- "add v26.4s, v22.4s, v25.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4fa0ebb0 // sdot v16.4s, v29.16b, v0.4b[3]\n"
- "add v25.4s, v21.4s, v16.4s\n"
- "add v27.4s, v21.4s, v27.4s\n"
- "add v28.4s, v20.4s, v30.4s\n"
- "add v29.4s, v19.4s, v31.4s\n"
- "add v30.4s, v18.4s, v20.4s\n"
- "add v31.4s, v17.4s, v19.4s\n"
+ ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
"neg v23.4s, v23.4s\n"
"mul v24.4s, v24.4s, v23.4s\n"
"mul v25.4s, v25.4s, v23.4s\n"
@@ -194,11 +195,11 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x60]\n"
- "ldr q20, [%x[params], #0x70]\n"
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
- "ldr q12, [%x[params], #0x80]\n"
+ "ldr q20, [%x[params], #0x80]\n"
".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -212,7 +213,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
@@ -221,7 +222,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
@@ -230,115 +231,115 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
- ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "and v18.16b, v25.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v19.4s\n"
- "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v19.4s\n"
- "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -347,14 +348,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -388,14 +389,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x60]\n"
@@ -420,7 +421,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add x21, x21, x28\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
@@ -430,7 +431,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
@@ -439,68 +440,68 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
"sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
"and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
@@ -536,14 +537,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -552,14 +553,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -635,4 +636,5 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 3dad8d5604..3f71c5fb64 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 3a544e0697..b21ad484e5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 1d45804714..fc83aaf5d2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,14 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *,
- const int32_t *,
- const arm_gemm::Requantize32&,
- const int32_t *, const int32_t *,
- int8_t *const *const);
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 3fc1b13d9c..aad34c4c25 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,716 +30,708 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"lsr x15, %x[n_channels], #0x4\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v9.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v8.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v7.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"mov x12, #0x0\n"
"mov x11, #0x0\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x15, 3f\n"
- "ldr q6, [x14, x12]\n"
- "ldr q5, [x13, x12]\n"
+ "ldr q11, [x14, x12]\n"
+ "ldr q20, [x13, x12]\n"
"subs x15, x15, #0x1\n"
- "ldr q4, [x10, x12]\n"
- "ldr q3, [x9, x12]\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q1, [x28, x12]\n"
- "ldr q0, [x27, x12]\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
- "ldr q31, [x26, x12]\n"
- "ldr q30, [x25, x12]\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "ldr q28, [%x[params], #0x20]\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "ldp x14, x13, [%x[inptrs], #0x40]\n"
- "ldr q25, [x14, x12]\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "ldr q24, [x13, x12]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "ldr q23, [x10, x12]\n"
- "ldr q22, [x9, x12]\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldr q20, [x28, x12]\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "ldr q19, [x27, x12]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "ldr q18, [x26, x12]\n"
- "ldr q17, [x25, x12]\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
+ "ldr q16, [x10, x12]\n"
+ "ldr q14, [x9, x12]\n"
+ "zip2 v19.16b, v11.16b, v16.16b\n"
+ "zip1 v11.16b, v11.16b, v16.16b\n"
+ "ldr q13, [x28, x12]\n"
+ "ldr q18, [x27, x12]\n"
+ "zip1 v17.16b, v20.16b, v14.16b\n"
+ "zip2 v14.16b, v20.16b, v14.16b\n"
+ "ldr q16, [x26, x12]\n"
+ "ldr q27, [x21, x12]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "zip1 v3.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "zip1 v16.16b, v18.16b, v27.16b\n"
+ "zip2 v27.16b, v18.16b, v27.16b\n"
+ "ldr q17, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v28.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "zip2 v20.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "zip1 v22.16b, v17.16b, v7.16b\n"
+ "zip2 v7.16b, v17.16b, v7.16b\n"
+ "ldr q19, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v21.16b, v30.16b, v27.16b\n"
+ "zip2 v27.16b, v30.16b, v27.16b\n"
+ "ldr q30, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "zip2 v17.16b, v16.16b, v30.16b\n"
+ "zip1 v16.16b, v16.16b, v30.16b\n"
+ "zip1 v18.16b, v19.16b, v1.16b\n"
+ "zip2 v1.16b, v19.16b, v1.16b\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v29.16b, v5.16b, v22.16b\n"
+ "zip1 v5.16b, v5.16b, v22.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
+ "zip2 v30.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v18.16b\n"
+ "zip1 v2.16b, v17.16b, v1.16b\n"
+ "zip2 v1.16b, v17.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
"add x12, x12, #0x10\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
"subs x15, x15, #0x1\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x60]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x40]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x30]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x70]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q5, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q13, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x20]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v26.16b, v24.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v23.16b, v24.16b\n"
+ ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
- "ldr q5, [x13, x12]\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- "ldr q0, [x27, x12]\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0xc0]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0xb0]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x90]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x80]\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
+ "ldr q10, [x13, x12]\n"
+ ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
+ ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
+ "ldr q20, [x27, x12]\n"
+ ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v5.4s\n"
+ ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v5.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v5.4s\n"
+ "ldr q19, [%x[params], #0xc0]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "and v16.16b, v23.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x80]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s25, [x23, x11]\n"
+ "str s23, [x22, x11]\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "mov v13.16b, v24.16b\n"
+ ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
"add x11, x11, #0x4\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
- "ldr q4, [x10, x12]\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- "ldr q31, [x26, x12]\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x120]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x100]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x110]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0xf0]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x130]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x10, x12]\n"
+ ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
+ "ldr q4, [x26, x12]\n"
+ ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v19.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "ldr q19, [%x[params], #0x120]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v31.16b, v22.16b\n"
+ "and v16.16b, v13.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0x130]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v13.4s, v13.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s24, [x25, x11]\n"
"ldr q2, [%x[params], #0xe0]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s23, [x24, x11]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str s31, [x23, x11]\n"
+ "mov v25.16b, v2.16b\n"
+ "str s13, [x22, x11]\n"
"mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mov v30.16b, v2.16b\n"
+ ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
- "ldr q3, [x9, x12]\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- "ldr q30, [x25, x12]\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [x14, x12]\n"
- "ldp x14, x13, [%x[inptrs], #0x40]\n"
- "ldr q25, [x14, x12]\n"
- "ldr q24, [x13, x12]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "ldr q23, [x10, x12]\n"
- "ldr q22, [x9, x12]\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x160]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x170]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x150]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [x28, x12]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldr q20, [x28, x12]\n"
- "ldr q19, [x27, x12]\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q18, [x26, x12]\n"
- "ldr q17, [x25, x12]\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x9, x12]\n"
+ ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
+ "ldr q27, [x21, x12]\n"
+ ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v2.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v19.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "ldr q29, [x20, x12]\n"
+ "sqadd v2.4s, v2.4s, v16.4s\n"
+ "and v19.16b, v25.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q26, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v22.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "ldr q9, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x170]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "ldr q13, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v8.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q28, [x20, x12]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v30.4s, v30.4s, v15.4s\n"
+ "smin v2.4s, v2.4s, v12.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x25, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x24, x11]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x23, x11]\n"
+ "str s30, [x22, x11]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
"add x11, x11, #0x4\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x140]\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
"add %x[params], %x[params], #0x180\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
+ "zip2 v22.16b, v13.16b, v4.16b\n"
+ "zip1 v13.16b, v13.16b, v4.16b\n"
+ "zip1 v2.16b, v20.16b, v27.16b\n"
+ "zip2 v27.16b, v20.16b, v27.16b\n"
+ "zip2 v19.16b, v5.16b, v26.16b\n"
+ "zip1 v5.16b, v5.16b, v26.16b\n"
+ "zip1 v18.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v4.16b, v16.16b, v23.16b\n"
+ "zip1 v16.16b, v16.16b, v23.16b\n"
+ "zip1 v17.16b, v28.16b, v1.16b\n"
+ "zip2 v1.16b, v28.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v2.16b\n"
+ "zip1 v13.16b, v13.16b, v2.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v18.16b\n"
+ "zip1 v5.16b, v5.16b, v18.16b\n"
+ "zip1 v0.16b, v19.16b, v7.16b\n"
+ "zip2 v7.16b, v19.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v4.16b, v1.16b\n"
+ "zip2 v1.16b, v4.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
"add x12, x12, #0x10\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x60]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x40]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x30]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x70]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x20]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
"ext v5.16b, v5.16b, v5.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0xc0]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0xb0]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x90]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x80]\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q19, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v19.16b\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "and v16.16b, v4.16b, v19.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q5, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q25, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v22.16b, v25.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "mov v19.16b, v25.16b\n"
+ ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
+ ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v24.4s\n"
+ ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v25.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0xc0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v18.16b, v22.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xd0]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v20.16b, v10.16b\n"
+ ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
+ "mov v19.16b, v10.16b\n"
+ ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
"add x11, x11, #0x4\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x120]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x100]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x110]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0xf0]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x130]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0xe0]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "add %x[params], %x[params], #0x140\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
"ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v10.4s, v10.4s, v24.4s\n"
+ ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v10.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0x120]\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v18.16b, v28.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v10.4s, v10.4s, v15.4s\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v10.4s, v10.4s, v8.4s\n"
+ "add v28.4s, v28.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s10, [x25, x11]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s28, [x24, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s19, [x22, x11]\n"
+ "mov v20.16b, v22.16b\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s22, [x25, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s21, [x24, x11]\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
"add x11, x11, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -751,740 +743,738 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x28, x28, x12\n"
"add x27, x27, x12\n"
"add x26, x26, x12\n"
- "add x25, x25, x12\n"
+ "add x21, x21, x12\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d6, [x14], #0x8\n"
- "ldr d5, [x13], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
+ "ldr d11, [x14], #0x8\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d3, [x10], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d13, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v6.s }[2], [x14], #0x4\n"
- "ld1 { v5.s }[2], [x13], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x14], #0x4\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v3.s }[2], [x10], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v6.h }[6], [x14], #0x2\n"
- "ld1 { v5.h }[6], [x13], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v1.h }[6], [x28], #0x2\n"
- "ld1 { v0.h }[6], [x27], #0x2\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v30.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x14], #0x2\n"
+ "ld1 { v10.h }[6], [x13], #0x2\n"
+ "ld1 { v3.h }[6], [x10], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[14], [x14], #0x1\n"
- "ld1 { v5.b }[14], [x13], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v1.b }[14], [x28], #0x1\n"
- "ld1 { v0.b }[14], [x27], #0x1\n"
- "ld1 { v31.b }[14], [x26], #0x1\n"
- "ld1 { v30.b }[14], [x25], #0x1\n"
+ "ld1 { v11.b }[14], [x14], #0x1\n"
+ "ld1 { v10.b }[14], [x13], #0x1\n"
+ "ld1 { v3.b }[14], [x10], #0x1\n"
+ "ld1 { v14.b }[14], [x9], #0x1\n"
+ "ld1 { v13.b }[14], [x28], #0x1\n"
+ "ld1 { v28.b }[14], [x27], #0x1\n"
+ "ld1 { v21.b }[14], [x26], #0x1\n"
+ "ld1 { v27.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[12], [x14], #0x1\n"
- "ld1 { v5.b }[12], [x13], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v1.b }[12], [x28], #0x1\n"
- "ld1 { v0.b }[12], [x27], #0x1\n"
- "ld1 { v31.b }[12], [x26], #0x1\n"
- "ld1 { v30.b }[12], [x25], #0x1\n"
+ "ld1 { v11.b }[12], [x14], #0x1\n"
+ "ld1 { v10.b }[12], [x13], #0x1\n"
+ "ld1 { v3.b }[12], [x10], #0x1\n"
+ "ld1 { v14.b }[12], [x9], #0x1\n"
+ "ld1 { v13.b }[12], [x28], #0x1\n"
+ "ld1 { v28.b }[12], [x27], #0x1\n"
+ "ld1 { v21.b }[12], [x26], #0x1\n"
+ "ld1 { v27.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v6.h }[4], [x14], #0x2\n"
- "ld1 { v5.h }[4], [x13], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v1.h }[4], [x28], #0x2\n"
- "ld1 { v0.h }[4], [x27], #0x2\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v30.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x14], #0x2\n"
+ "ld1 { v10.h }[4], [x13], #0x2\n"
+ "ld1 { v3.h }[4], [x10], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[10], [x14], #0x1\n"
- "ld1 { v5.b }[10], [x13], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v1.b }[10], [x28], #0x1\n"
- "ld1 { v0.b }[10], [x27], #0x1\n"
- "ld1 { v31.b }[10], [x26], #0x1\n"
- "ld1 { v30.b }[10], [x25], #0x1\n"
+ "ld1 { v11.b }[10], [x14], #0x1\n"
+ "ld1 { v10.b }[10], [x13], #0x1\n"
+ "ld1 { v3.b }[10], [x10], #0x1\n"
+ "ld1 { v14.b }[10], [x9], #0x1\n"
+ "ld1 { v13.b }[10], [x28], #0x1\n"
+ "ld1 { v28.b }[10], [x27], #0x1\n"
+ "ld1 { v21.b }[10], [x26], #0x1\n"
+ "ld1 { v27.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[8], [x14], #0x1\n"
- "ld1 { v5.b }[8], [x13], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v1.b }[8], [x28], #0x1\n"
- "ld1 { v0.b }[8], [x27], #0x1\n"
- "ld1 { v31.b }[8], [x26], #0x1\n"
- "ld1 { v30.b }[8], [x25], #0x1\n"
+ "ld1 { v11.b }[8], [x14], #0x1\n"
+ "ld1 { v10.b }[8], [x13], #0x1\n"
+ "ld1 { v3.b }[8], [x10], #0x1\n"
+ "ld1 { v14.b }[8], [x9], #0x1\n"
+ "ld1 { v13.b }[8], [x28], #0x1\n"
+ "ld1 { v28.b }[8], [x27], #0x1\n"
+ "ld1 { v21.b }[8], [x26], #0x1\n"
+ "ld1 { v27.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s6, [x14], #0x4\n"
- "ldr s5, [x13], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
+ "ldr s11, [x14], #0x4\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s3, [x10], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s13, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v6.h }[2], [x14], #0x2\n"
- "ld1 { v5.h }[2], [x13], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v1.h }[2], [x28], #0x2\n"
- "ld1 { v0.h }[2], [x27], #0x2\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x14], #0x2\n"
+ "ld1 { v10.h }[2], [x13], #0x2\n"
+ "ld1 { v3.h }[2], [x10], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[6], [x14], #0x1\n"
- "ld1 { v5.b }[6], [x13], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v1.b }[6], [x28], #0x1\n"
- "ld1 { v0.b }[6], [x27], #0x1\n"
- "ld1 { v31.b }[6], [x26], #0x1\n"
- "ld1 { v30.b }[6], [x25], #0x1\n"
+ "ld1 { v11.b }[6], [x14], #0x1\n"
+ "ld1 { v10.b }[6], [x13], #0x1\n"
+ "ld1 { v3.b }[6], [x10], #0x1\n"
+ "ld1 { v14.b }[6], [x9], #0x1\n"
+ "ld1 { v13.b }[6], [x28], #0x1\n"
+ "ld1 { v28.b }[6], [x27], #0x1\n"
+ "ld1 { v21.b }[6], [x26], #0x1\n"
+ "ld1 { v27.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[4], [x14], #0x1\n"
- "ld1 { v5.b }[4], [x13], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v1.b }[4], [x28], #0x1\n"
- "ld1 { v0.b }[4], [x27], #0x1\n"
- "ld1 { v31.b }[4], [x26], #0x1\n"
- "ld1 { v30.b }[4], [x25], #0x1\n"
+ "ld1 { v11.b }[4], [x14], #0x1\n"
+ "ld1 { v10.b }[4], [x13], #0x1\n"
+ "ld1 { v3.b }[4], [x10], #0x1\n"
+ "ld1 { v14.b }[4], [x9], #0x1\n"
+ "ld1 { v13.b }[4], [x28], #0x1\n"
+ "ld1 { v28.b }[4], [x27], #0x1\n"
+ "ld1 { v21.b }[4], [x26], #0x1\n"
+ "ld1 { v27.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h6, [x14], #0x2\n"
- "ldr h5, [x13], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h0, [x27], #0x2\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h30, [x25], #0x2\n"
+ "ldr h11, [x14], #0x2\n"
+ "ldr h10, [x13], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h13, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h21, [x26], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[2], [x14], #0x1\n"
- "ld1 { v5.b }[2], [x13], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v1.b }[2], [x28], #0x1\n"
- "ld1 { v0.b }[2], [x27], #0x1\n"
- "ld1 { v31.b }[2], [x26], #0x1\n"
- "ld1 { v30.b }[2], [x25], #0x1\n"
+ "ld1 { v11.b }[2], [x14], #0x1\n"
+ "ld1 { v10.b }[2], [x13], #0x1\n"
+ "ld1 { v3.b }[2], [x10], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v13.b }[2], [x28], #0x1\n"
+ "ld1 { v28.b }[2], [x27], #0x1\n"
+ "ld1 { v21.b }[2], [x26], #0x1\n"
+ "ld1 { v27.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b6, [x14], #0x1\n"
- "ldr b5, [x13], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b1, [x28], #0x1\n"
- "ldr b0, [x27], #0x1\n"
- "ldr b31, [x26], #0x1\n"
- "ldr b30, [x25], #0x1\n"
+ "ldr b11, [x14], #0x1\n"
+ "ldr b10, [x13], #0x1\n"
+ "ldr b3, [x10], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b13, [x28], #0x1\n"
+ "ldr b28, [x27], #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "ldr b27, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x14, x13, [%x[inptrs], #0x40]\n"
"ldp x10, x9, [%x[inptrs], #0x50]\n"
"add x14, x14, x12\n"
"add x13, x13, x12\n"
"ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x10, x10, x12\n"
"add x9, x9, x12\n"
"add x28, x28, x12\n"
"add x27, x27, x12\n"
"add x26, x26, x12\n"
- "add x25, x25, x12\n"
+ "add x21, x21, x12\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d25, [x14], #0x8\n"
- "ldr d24, [x13], #0x8\n"
- "ldr d23, [x10], #0x8\n"
- "ldr d22, [x9], #0x8\n"
- "ldr d20, [x28], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d5, [x14], #0x8\n"
+ "ldr d29, [x13], #0x8\n"
+ "ldr d0, [x10], #0x8\n"
+ "ldr d7, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v25.s }[2], [x14], #0x4\n"
- "ld1 { v24.s }[2], [x13], #0x4\n"
- "ld1 { v23.s }[2], [x10], #0x4\n"
- "ld1 { v22.s }[2], [x9], #0x4\n"
- "ld1 { v20.s }[2], [x28], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v5.s }[2], [x14], #0x4\n"
+ "ld1 { v29.s }[2], [x13], #0x4\n"
+ "ld1 { v0.s }[2], [x10], #0x4\n"
+ "ld1 { v7.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v25.h }[6], [x14], #0x2\n"
- "ld1 { v24.h }[6], [x13], #0x2\n"
- "ld1 { v23.h }[6], [x10], #0x2\n"
- "ld1 { v22.h }[6], [x9], #0x2\n"
- "ld1 { v20.h }[6], [x28], #0x2\n"
- "ld1 { v19.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v17.h }[6], [x25], #0x2\n"
+ "ld1 { v5.h }[6], [x14], #0x2\n"
+ "ld1 { v29.h }[6], [x13], #0x2\n"
+ "ld1 { v0.h }[6], [x10], #0x2\n"
+ "ld1 { v7.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v30.h }[6], [x27], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[14], [x14], #0x1\n"
- "ld1 { v24.b }[14], [x13], #0x1\n"
- "ld1 { v23.b }[14], [x10], #0x1\n"
- "ld1 { v22.b }[14], [x9], #0x1\n"
- "ld1 { v20.b }[14], [x28], #0x1\n"
- "ld1 { v19.b }[14], [x27], #0x1\n"
- "ld1 { v18.b }[14], [x26], #0x1\n"
- "ld1 { v17.b }[14], [x25], #0x1\n"
+ "ld1 { v5.b }[14], [x14], #0x1\n"
+ "ld1 { v29.b }[14], [x13], #0x1\n"
+ "ld1 { v0.b }[14], [x10], #0x1\n"
+ "ld1 { v7.b }[14], [x9], #0x1\n"
+ "ld1 { v16.b }[14], [x28], #0x1\n"
+ "ld1 { v30.b }[14], [x27], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[12], [x14], #0x1\n"
- "ld1 { v24.b }[12], [x13], #0x1\n"
- "ld1 { v23.b }[12], [x10], #0x1\n"
- "ld1 { v22.b }[12], [x9], #0x1\n"
- "ld1 { v20.b }[12], [x28], #0x1\n"
- "ld1 { v19.b }[12], [x27], #0x1\n"
- "ld1 { v18.b }[12], [x26], #0x1\n"
- "ld1 { v17.b }[12], [x25], #0x1\n"
+ "ld1 { v5.b }[12], [x14], #0x1\n"
+ "ld1 { v29.b }[12], [x13], #0x1\n"
+ "ld1 { v0.b }[12], [x10], #0x1\n"
+ "ld1 { v7.b }[12], [x9], #0x1\n"
+ "ld1 { v16.b }[12], [x28], #0x1\n"
+ "ld1 { v30.b }[12], [x27], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v25.h }[4], [x14], #0x2\n"
- "ld1 { v24.h }[4], [x13], #0x2\n"
- "ld1 { v23.h }[4], [x10], #0x2\n"
- "ld1 { v22.h }[4], [x9], #0x2\n"
- "ld1 { v20.h }[4], [x28], #0x2\n"
- "ld1 { v19.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v17.h }[4], [x25], #0x2\n"
+ "ld1 { v5.h }[4], [x14], #0x2\n"
+ "ld1 { v29.h }[4], [x13], #0x2\n"
+ "ld1 { v0.h }[4], [x10], #0x2\n"
+ "ld1 { v7.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v30.h }[4], [x27], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[10], [x14], #0x1\n"
- "ld1 { v24.b }[10], [x13], #0x1\n"
- "ld1 { v23.b }[10], [x10], #0x1\n"
- "ld1 { v22.b }[10], [x9], #0x1\n"
- "ld1 { v20.b }[10], [x28], #0x1\n"
- "ld1 { v19.b }[10], [x27], #0x1\n"
- "ld1 { v18.b }[10], [x26], #0x1\n"
- "ld1 { v17.b }[10], [x25], #0x1\n"
+ "ld1 { v5.b }[10], [x14], #0x1\n"
+ "ld1 { v29.b }[10], [x13], #0x1\n"
+ "ld1 { v0.b }[10], [x10], #0x1\n"
+ "ld1 { v7.b }[10], [x9], #0x1\n"
+ "ld1 { v16.b }[10], [x28], #0x1\n"
+ "ld1 { v30.b }[10], [x27], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[8], [x14], #0x1\n"
- "ld1 { v24.b }[8], [x13], #0x1\n"
- "ld1 { v23.b }[8], [x10], #0x1\n"
- "ld1 { v22.b }[8], [x9], #0x1\n"
- "ld1 { v20.b }[8], [x28], #0x1\n"
- "ld1 { v19.b }[8], [x27], #0x1\n"
- "ld1 { v18.b }[8], [x26], #0x1\n"
- "ld1 { v17.b }[8], [x25], #0x1\n"
+ "ld1 { v5.b }[8], [x14], #0x1\n"
+ "ld1 { v29.b }[8], [x13], #0x1\n"
+ "ld1 { v0.b }[8], [x10], #0x1\n"
+ "ld1 { v7.b }[8], [x9], #0x1\n"
+ "ld1 { v16.b }[8], [x28], #0x1\n"
+ "ld1 { v30.b }[8], [x27], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s25, [x14], #0x4\n"
- "ldr s24, [x13], #0x4\n"
- "ldr s23, [x10], #0x4\n"
- "ldr s22, [x9], #0x4\n"
- "ldr s20, [x28], #0x4\n"
- "ldr s19, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
+ "ldr s5, [x14], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s7, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v25.h }[2], [x14], #0x2\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "ld1 { v22.h }[2], [x9], #0x2\n"
- "ld1 { v20.h }[2], [x28], #0x2\n"
- "ld1 { v19.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v17.h }[2], [x25], #0x2\n"
+ "ld1 { v5.h }[2], [x14], #0x2\n"
+ "ld1 { v29.h }[2], [x13], #0x2\n"
+ "ld1 { v0.h }[2], [x10], #0x2\n"
+ "ld1 { v7.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[6], [x14], #0x1\n"
- "ld1 { v24.b }[6], [x13], #0x1\n"
- "ld1 { v23.b }[6], [x10], #0x1\n"
- "ld1 { v22.b }[6], [x9], #0x1\n"
- "ld1 { v20.b }[6], [x28], #0x1\n"
- "ld1 { v19.b }[6], [x27], #0x1\n"
- "ld1 { v18.b }[6], [x26], #0x1\n"
- "ld1 { v17.b }[6], [x25], #0x1\n"
+ "ld1 { v5.b }[6], [x14], #0x1\n"
+ "ld1 { v29.b }[6], [x13], #0x1\n"
+ "ld1 { v0.b }[6], [x10], #0x1\n"
+ "ld1 { v7.b }[6], [x9], #0x1\n"
+ "ld1 { v16.b }[6], [x28], #0x1\n"
+ "ld1 { v30.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[4], [x14], #0x1\n"
- "ld1 { v24.b }[4], [x13], #0x1\n"
- "ld1 { v23.b }[4], [x10], #0x1\n"
- "ld1 { v22.b }[4], [x9], #0x1\n"
- "ld1 { v20.b }[4], [x28], #0x1\n"
- "ld1 { v19.b }[4], [x27], #0x1\n"
- "ld1 { v18.b }[4], [x26], #0x1\n"
- "ld1 { v17.b }[4], [x25], #0x1\n"
+ "ld1 { v5.b }[4], [x14], #0x1\n"
+ "ld1 { v29.b }[4], [x13], #0x1\n"
+ "ld1 { v0.b }[4], [x10], #0x1\n"
+ "ld1 { v7.b }[4], [x9], #0x1\n"
+ "ld1 { v16.b }[4], [x28], #0x1\n"
+ "ld1 { v30.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h25, [x14], #0x2\n"
- "ldr h24, [x13], #0x2\n"
- "ldr h23, [x10], #0x2\n"
- "ldr h22, [x9], #0x2\n"
- "ldr h20, [x28], #0x2\n"
- "ldr h19, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h17, [x25], #0x2\n"
+ "ldr h5, [x14], #0x2\n"
+ "ldr h29, [x13], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h7, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h30, [x27], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[2], [x14], #0x1\n"
- "ld1 { v24.b }[2], [x13], #0x1\n"
- "ld1 { v23.b }[2], [x10], #0x1\n"
- "ld1 { v22.b }[2], [x9], #0x1\n"
- "ld1 { v20.b }[2], [x28], #0x1\n"
- "ld1 { v19.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v17.b }[2], [x25], #0x1\n"
+ "ld1 { v5.b }[2], [x14], #0x1\n"
+ "ld1 { v29.b }[2], [x13], #0x1\n"
+ "ld1 { v0.b }[2], [x10], #0x1\n"
+ "ld1 { v7.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v30.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b25, [x14], #0x1\n"
- "ldr b24, [x13], #0x1\n"
- "ldr b23, [x10], #0x1\n"
- "ldr b22, [x9], #0x1\n"
- "ldr b20, [x28], #0x1\n"
- "ldr b19, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b17, [x25], #0x1\n"
+ "ldr b5, [x14], #0x1\n"
+ "ldr b29, [x13], #0x1\n"
+ "ldr b0, [x10], #0x1\n"
+ "ldr b7, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b30, [x27], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q29, [%x[params], #0x10]\n"
- "ldr q28, [%x[params], #0x20]\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q27, [%x[params], #0x30]\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
"cmp x20, #0x4\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x0]\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v22.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v28.16b, v27.16b\n"
+ "zip2 v20.16b, v5.16b, v0.16b\n"
+ "zip1 v5.16b, v5.16b, v0.16b\n"
+ "zip1 v19.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v18.16b, v16.16b, v2.16b\n"
+ "zip1 v16.16b, v16.16b, v2.16b\n"
+ "zip1 v17.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v19.16b\n"
+ "zip1 v5.16b, v5.16b, v19.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v1.16b\n"
+ "zip2 v1.16b, v18.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
+ ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
+ ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
+ ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 20f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 21f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
"cmp x20, #0x4\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
+ ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
+ ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 24f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 25f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
"cmp x20, #0x4\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 28f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 29f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
-
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q22, [%x[params], #0x50]\n"
+ ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
+ ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v23.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v18.16b, v22.16b\n"
+ "and v16.16b, v4.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "srshl v4.4s, v4.4s, v22.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 33f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
-
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 8366b0a270..bea97a54b6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,7 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const uint8_t *const * const, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 986937f3b4..5a28daffbf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,15 +30,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const uint8_t *const *const inptrs,
- const uint8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- uint8_t *const *const outptrs
-)
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x20, #0x1\n"
@@ -47,817 +39,817 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"orr x20, x20, #0x10000\n"
"lsr x11, %x[n_channels], #0x4\n"
- "dup v14.4s, w20\n"
+ "dup v12.4s, w20\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v13.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"mov x28, #0x0\n"
"mov x27, #0x0\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x11, 3f\n"
- "ldr q9, [x15, x28]\n"
- "ldr q8, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q7, [x13, x28]\n"
- "ldr q6, [x12, x28]\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q4, [x10, x28]\n"
- "ldr q3, [x9, x28]\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "ldr q2, [x26, x28]\n"
- "ldr q1, [x25, x28]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q15, [x15, x28]\n"
"ldr q28, [x14, x28]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "ldr q23, [x9, x28]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
"add x28, x28, #0x10\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"subs x11, x11, #0x1\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- "ldr q3, [x9, x28]\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "ldr q8, [x14, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- "ldr q2, [x26, x28]\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "ldr q7, [x13, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
"smax v25.4s, v25.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- "ldr q1, [x25, x28]\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "ldr q6, [x12, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [x15, x28]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x170]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x150]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [x10, x28]\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "ldr q23, [x9, x28]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "str s20, [x21, x27]\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
"add x27, x27, #0x4\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x140]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"add x28, x28, #0x10\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
"ldr q4, [%x[params], #0x10]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
"ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str s5, [x24, x27]\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
"add x27, x27, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -869,794 +861,794 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d8, [x14], #0x8\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
"ldr d7, [x13], #0x8\n"
- "ldr d6, [x12], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d1, [x25], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
"ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v6.s }[2], [x12], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.h }[6], [x15], #0x2\n"
- "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
"ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v6.h }[6], [x12], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[14], [x15], #0x1\n"
- "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
"ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v6.b }[14], [x12], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[12], [x15], #0x1\n"
- "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
"ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v6.b }[12], [x12], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.h }[4], [x15], #0x2\n"
- "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
"ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v6.h }[4], [x12], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[10], [x15], #0x1\n"
- "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
"ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v6.b }[10], [x12], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[8], [x15], #0x1\n"
- "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
"ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v6.b }[8], [x12], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s9, [x15], #0x4\n"
- "ldr s8, [x14], #0x4\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
"ldr s7, [x13], #0x4\n"
- "ldr s6, [x12], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v9.h }[2], [x15], #0x2\n"
- "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
"ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v6.h }[2], [x12], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[6], [x15], #0x1\n"
- "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
"ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v6.b }[6], [x12], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[4], [x15], #0x1\n"
- "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
"ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v6.b }[4], [x12], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h9, [x15], #0x2\n"
- "ldr h8, [x14], #0x2\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
"ldr h7, [x13], #0x2\n"
- "ldr h6, [x12], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[2], [x15], #0x1\n"
- "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
"ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v6.b }[2], [x12], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b9, [x15], #0x1\n"
- "ldr b8, [x14], #0x1\n"
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
"ldr b7, [x13], #0x1\n"
- "ldr b6, [x12], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b1, [x25], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
"ldp x13, x12, [%x[inptrs], #0x50]\n"
"add x15, x15, x28\n"
"add x14, x14, x28\n"
"ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x13, x13, x28\n"
"add x12, x12, x28\n"
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d29, [x15], #0x8\n"
- "ldr d28, [x14], #0x8\n"
- "ldr d27, [x13], #0x8\n"
- "ldr d26, [x12], #0x8\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v29.s }[2], [x15], #0x4\n"
- "ld1 { v28.s }[2], [x14], #0x4\n"
- "ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v26.s }[2], [x12], #0x4\n"
- "ld1 { v24.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v29.h }[6], [x15], #0x2\n"
- "ld1 { v28.h }[6], [x14], #0x2\n"
- "ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v26.h }[6], [x12], #0x2\n"
- "ld1 { v24.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v22.h }[6], [x26], #0x2\n"
- "ld1 { v21.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[14], [x15], #0x1\n"
- "ld1 { v28.b }[14], [x14], #0x1\n"
- "ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v26.b }[14], [x12], #0x1\n"
- "ld1 { v24.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v22.b }[14], [x26], #0x1\n"
- "ld1 { v21.b }[14], [x25], #0x1\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[12], [x15], #0x1\n"
- "ld1 { v28.b }[12], [x14], #0x1\n"
- "ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v26.b }[12], [x12], #0x1\n"
- "ld1 { v24.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v22.b }[12], [x26], #0x1\n"
- "ld1 { v21.b }[12], [x25], #0x1\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v29.h }[4], [x15], #0x2\n"
- "ld1 { v28.h }[4], [x14], #0x2\n"
- "ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v26.h }[4], [x12], #0x2\n"
- "ld1 { v24.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v22.h }[4], [x26], #0x2\n"
- "ld1 { v21.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[10], [x15], #0x1\n"
- "ld1 { v28.b }[10], [x14], #0x1\n"
- "ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v26.b }[10], [x12], #0x1\n"
- "ld1 { v24.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v22.b }[10], [x26], #0x1\n"
- "ld1 { v21.b }[10], [x25], #0x1\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[8], [x15], #0x1\n"
- "ld1 { v28.b }[8], [x14], #0x1\n"
- "ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v26.b }[8], [x12], #0x1\n"
- "ld1 { v24.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v22.b }[8], [x26], #0x1\n"
- "ld1 { v21.b }[8], [x25], #0x1\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s29, [x15], #0x4\n"
- "ldr s28, [x14], #0x4\n"
- "ldr s27, [x13], #0x4\n"
- "ldr s26, [x12], #0x4\n"
- "ldr s24, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v29.h }[2], [x15], #0x2\n"
- "ld1 { v28.h }[2], [x14], #0x2\n"
- "ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v24.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v22.h }[2], [x26], #0x2\n"
- "ld1 { v21.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[6], [x15], #0x1\n"
- "ld1 { v28.b }[6], [x14], #0x1\n"
- "ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v26.b }[6], [x12], #0x1\n"
- "ld1 { v24.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v22.b }[6], [x26], #0x1\n"
- "ld1 { v21.b }[6], [x25], #0x1\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[4], [x15], #0x1\n"
- "ld1 { v28.b }[4], [x14], #0x1\n"
- "ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v26.b }[4], [x12], #0x1\n"
- "ld1 { v24.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v22.b }[4], [x26], #0x1\n"
- "ld1 { v21.b }[4], [x25], #0x1\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h29, [x15], #0x2\n"
- "ldr h28, [x14], #0x2\n"
- "ldr h27, [x13], #0x2\n"
- "ldr h26, [x12], #0x2\n"
- "ldr h24, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "ldr h21, [x25], #0x2\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[2], [x15], #0x1\n"
- "ld1 { v28.b }[2], [x14], #0x1\n"
- "ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v26.b }[2], [x12], #0x1\n"
- "ld1 { v24.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v22.b }[2], [x26], #0x1\n"
- "ld1 { v21.b }[2], [x25], #0x1\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b29, [x15], #0x1\n"
- "ldr b28, [x14], #0x1\n"
- "ldr b27, [x13], #0x1\n"
- "ldr b26, [x12], #0x1\n"
- "ldr b24, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "ldr b21, [x25], #0x1\n"
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
"cmp x20, #0x4\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x50]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
+ ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
"add %x[params], %x[params], #0x60\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 20f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 21f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
+ ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 24f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 25f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- "movi v17.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 28f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 29f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
+ ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 33f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 49ef5dc0d9..9fc6a5bc34 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 15bbb31413..26fe4c8a10 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 0baebafa3f..f4f2bc82e1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index de072a7d55..fb533893a6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 407807fcc1..375e6f8f15 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 2fe688a65e..ae663585a2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
index b859978b1e..814efe006e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKern
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 39001aa1fd..f7aa889b56 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"usubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"usubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"usubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"usubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 134f657fb8..76965606f7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -57,4 +57,5 @@ struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index a6dba90f9e..d69f391514 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -40,169 +41,169 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q14, [%x[params], #0x0]\n"
+ "ldr q11, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v15.16b, #0x1\n"
- "ushr v15.4s, v15.4s, #0x8\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ld1 { v1.16b }, [x20]\n"
- "mov v29.16b, v1.16b\n"
- "mov v16.16b, v1.16b\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1 { v2.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v22.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x20]\n"
"ld1 { v4.16b }, [x20]\n"
- "mov v31.16b, v2.16b\n"
- "mov v30.16b, v2.16b\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1 { v0.16b }, [x20]\n"
- "mov v23.16b, v4.16b\n"
- "mov v21.16b, v4.16b\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1 { v3.16b }, [x20]\n"
- "mov v20.16b, v4.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x2\n"
- "ext v16.16b, v16.16b, v16.16b, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x2\n"
- "ext v31.16b, v31.16b, v31.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"ld1r { v12.4s }, [x20]\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
- "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v10.4s }, [x20]\n"
- "mov v25.16b, v0.16b\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
"mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
"mov v18.16b, v0.16b\n"
- "mov v24.16b, v3.16b\n"
+ "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"mov v17.16b, v3.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
"ext v19.16b, v19.16b, v19.16b, #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v1.4s, v1.4s, v16.4s\n"
- "mov v16.16b, v3.16b\n"
- "zip1 v29.4s, v29.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v31.4s\n"
- "zip1 v22.4s, v22.4s, v30.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x2\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v21.4s\n"
- "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
"zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v25.4s, v25.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v29.4s\n"
- "zip1 v2.4s, v2.4s, v22.4s\n"
- ".inst 0x6f81e1fa // udot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
"zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v24.4s, v24.4s, v16.4s\n"
- ".inst 0x6fa1e1fb // udot v27.4s, v15.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v23.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
"movi v23.4s, #0x0\n"
- ".inst 0x6f81e9f7 // udot v23.4s, v15.16b, v1.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
"movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
- ".inst 0x6fa1e9f6 // udot v22.4s, v15.16b, v1.4b[3]\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x6f82e1f5 // udot v21.4s, v15.16b, v2.4b[0]\n"
- "movi v8.4s, #0x0\n"
+ ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6fa2e1f4 // udot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
"movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6f82e9e9 // udot v9.4s, v15.16b, v2.4b[2]\n"
+ ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v25.4s\n"
- ".inst 0x6fa2e9e8 // udot v8.4s, v15.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v24.4s\n"
- ".inst 0x6f84e1f3 // udot v19.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x6fa4e1f2 // udot v18.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6f84e9f1 // udot v17.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x6fa4e9f0 // udot v16.4s, v15.16b, v4.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
"movi v30.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x6f80e1ff // udot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
"movi v28.4s, #0x0\n"
- ".inst 0x6fa0e1fe // udot v30.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6f80e9fd // udot v29.4s, v15.16b, v0.4b[2]\n"
- ".inst 0x6fa0e9fc // udot v28.4s, v15.16b, v0.4b[3]\n"
- "add v24.4s, v26.4s, v21.4s\n"
- "add v25.4s, v27.4s, v20.4s\n"
- "add v26.4s, v23.4s, v9.4s\n"
- "add v27.4s, v22.4s, v8.4s\n"
- "add v23.4s, v19.4s, v21.4s\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x6f83e1f6 // udot v22.4s, v15.16b, v3.4b[0]\n"
- "add v21.4s, v18.4s, v20.4s\n"
+ ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
"movi v20.4s, #0x0\n"
- ".inst 0x6fa3e1f4 // udot v20.4s, v15.16b, v3.4b[1]\n"
- "add v19.4s, v17.4s, v9.4s\n"
+ ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f83e9f2 // udot v18.4s, v15.16b, v3.4b[2]\n"
- "add v17.4s, v16.4s, v8.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6fa3e9f0 // udot v16.4s, v15.16b, v3.4b[3]\n"
+ ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
"add v24.4s, v24.4s, v31.4s\n"
"add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v28.4s\n"
- "add v28.4s, v23.4s, v22.4s\n"
- "add v29.4s, v21.4s, v20.4s\n"
- "add v30.4s, v19.4s, v18.4s\n"
- "add v31.4s, v17.4s, v16.4s\n"
- "neg v13.4s, v13.4s\n"
- "mul v24.4s, v24.4s, v13.4s\n"
- "mul v25.4s, v25.4s, v13.4s\n"
- "mul v26.4s, v26.4s, v13.4s\n"
- "mul v27.4s, v27.4s, v13.4s\n"
- "mul v28.4s, v28.4s, v13.4s\n"
- "mul v29.4s, v29.4s, v13.4s\n"
- "mul v30.4s, v30.4s, v13.4s\n"
- "mul v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
- "ldr q14, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x20]\n"
".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -219,43 +220,43 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr q5, [%x[params], #0x30]\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
+ "and v19.16b, v24.16b, v21.16b\n"
".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
- "and v18.16b, v25.16b, v20.16b\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "and v16.16b, v27.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
@@ -264,38 +265,38 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -329,14 +330,14 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x0]\n"
@@ -415,30 +416,30 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -514,4 +515,5 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index b575a5d169..4485aaa735 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -57,4 +57,5 @@ struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 027cc9e5a2..61cec2b66d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,133 +43,133 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
__asm__ __volatile__(
"ldr q12, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v28.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v31.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v30.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "mov v16.16b, v3.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v4.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
- "mov v15.16b, v4.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
- "mov v20.16b, v2.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
"ld1 { v1.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v16.2d\n"
- "zip1 v4.2d, v4.2d, v15.2d\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
"ld1 { v5.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x30]\n"
"mov v26.16b, v1.16b\n"
- "mov v13.16b, v5.16b\n"
+ "mov v22.16b, v5.16b\n"
"ld1 { v6.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x38]\n"
"mov v19.16b, v6.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v7.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x0]\n"
- "mov v17.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v20.2d\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
"ld1 { v0.16b }, [x20]\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
"ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6f83e392 // udot v18.4s, v28.16b, v3.4b[0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x6f83eb9f // udot v31.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x6f84e398 // udot v24.4s, v28.16b, v4.4b[0]\n"
+ ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v23.4s }, [x20]\n"
- ".inst 0x6f84eb9e // udot v30.4s, v28.16b, v4.4b[2]\n"
- "mov v16.16b, v0.16b\n"
- ".inst 0x6f82e395 // udot v21.4s, v28.16b, v2.4b[0]\n"
- "movi v20.4s, #0x0\n"
- "movi v29.4s, #0x1\n"
- ".inst 0x6f82eb94 // udot v20.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
"zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x6fa3e3b2 // udot v18.4s, v29.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v13.2d\n"
+ ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
"zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x6fa3ebbf // udot v31.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v17.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
"movi v22.4s, #0x0\n"
- ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x6fa4ebbe // udot v30.4s, v29.16b, v4.4b[3]\n"
- ".inst 0x6f81e396 // udot v22.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
"ld1r { v15.4s }, [x20]\n"
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x6f81eb9a // udot v26.4s, v28.16b, v1.4b[2]\n"
- "zip1 v0.2d, v0.2d, v16.2d\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6f85e399 // udot v25.4s, v28.16b, v5.4b[0]\n"
+ ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6f85eb9b // udot v27.4s, v28.16b, v5.4b[2]\n"
- ".inst 0x6f86e393 // udot v19.4s, v28.16b, v6.4b[0]\n"
- "add v24.4s, v18.4s, v24.4s\n"
- "mov x9, #0x0\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f86eb92 // udot v18.4s, v28.16b, v6.4b[2]\n"
- ".inst 0x6fa2e3b5 // udot v21.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- ".inst 0x6fa2ebb4 // udot v20.4s, v29.16b, v2.4b[3]\n"
- "add v17.4s, v31.4s, v30.4s\n"
- ".inst 0x6fa1e3b6 // udot v22.4s, v29.16b, v1.4b[1]\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f87e390 // udot v16.4s, v28.16b, v7.4b[0]\n"
- ".inst 0x6fa1ebba // udot v26.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- ".inst 0x6fa5e3b9 // udot v25.4s, v29.16b, v5.4b[1]\n"
- ".inst 0x6fa5ebbb // udot v27.4s, v29.16b, v5.4b[3]\n"
- "add v30.4s, v21.4s, v24.4s\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x6fa6e3b3 // udot v19.4s, v29.16b, v6.4b[1]\n"
- ".inst 0x6fa6ebb2 // udot v18.4s, v29.16b, v6.4b[3]\n"
- "add v31.4s, v20.4s, v17.4s\n"
+ ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x6fa7e3b0 // udot v16.4s, v29.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v30.4s\n"
+ ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- "add v21.4s, v26.4s, v31.4s\n"
- "add v20.4s, v25.4s, v19.4s\n"
- "add v19.4s, v27.4s, v18.4s\n"
- "add v18.4s, v16.4s, v24.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f87eb90 // udot v16.4s, v28.16b, v7.4b[2]\n"
- ".inst 0x6fa7ebb0 // udot v16.4s, v29.16b, v7.4b[3]\n"
- "add v17.4s, v16.4s, v17.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x6fa0e3b0 // udot v16.4s, v29.16b, v0.4b[1]\n"
- "add v24.4s, v22.4s, v16.4s\n"
- "add v26.4s, v22.4s, v25.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x6fa0ebb0 // udot v16.4s, v29.16b, v0.4b[3]\n"
- "add v25.4s, v21.4s, v16.4s\n"
- "add v27.4s, v21.4s, v27.4s\n"
- "add v28.4s, v20.4s, v30.4s\n"
- "add v29.4s, v19.4s, v31.4s\n"
- "add v30.4s, v18.4s, v20.4s\n"
- "add v31.4s, v17.4s, v19.4s\n"
+ ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
"neg v23.4s, v23.4s\n"
"mul v24.4s, v24.4s, v23.4s\n"
"mul v25.4s, v25.4s, v23.4s\n"
@@ -194,11 +195,11 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x60]\n"
- "ldr q20, [%x[params], #0x70]\n"
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
- "ldr q12, [%x[params], #0x80]\n"
+ "ldr q20, [%x[params], #0x80]\n"
".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -212,7 +213,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
@@ -221,7 +222,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
@@ -230,115 +231,115 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
- ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "and v18.16b, v25.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v19.4s\n"
- "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v19.4s\n"
- "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -347,14 +348,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -388,14 +389,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x60]\n"
@@ -420,7 +421,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add x21, x21, x28\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
@@ -430,7 +431,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
@@ -439,68 +440,68 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
"sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
"and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
@@ -536,14 +537,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -552,14 +553,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -635,4 +636,5 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 13f903b95d..1f2d211be2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index bbb817a883..0770c126ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 55731060f4..20a37b157f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,16 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const
-);
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index afc6695ff1..d1872c90f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1070 +91,1070 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
+ "lsr x15, x16, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v18.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v20.8h }, [x21]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v5.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "mov x17, #0x0\n"
- "ld1r { v13.8h }, [x20]\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "ldr q14, [x27, #0x0]\n"
- "ldr q11, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "str x27, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d31, [x24, x17]\n"
- "ldr d30, [x23, x17]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d29, [x22, x17]\n"
- "ldr d28, [x21, x17]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr x20, [x15, #0x20]\n"
- "ldr d27, [x20, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x13, #0x0\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x28, x27, [x22, #0x0]\n"
+ "ldp x26, x25, [x22, #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "subs x15, x15, #0x1\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d23, [x23, x14]\n"
+ "ldr d10, [x22, x14]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d11, [x21, x14]\n"
+ "ldr d13, [x20, x14]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr x20, [x12, #0x20]\n"
+ "ldr d27, [x20, x14]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q22, [x13, #0x0]\n"
- "ldr q10, [x12, #0x0]\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr q18, [x13, #0x10]\n"
- "ldr q26, [x12, #0x10]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr d29, [x20, x17]\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "ldr x26, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr d30, [x20, x17]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x17]\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr d31, [x26, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x48\n"
- "subs x8, x8, #0x1\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x17]\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "add x17, x17, #0x8\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q26, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x24, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v10.4h, v7.4h\n"
+ "ldr x23, [x12, #0x68]\n"
+ "ldr x22, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x21, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x21, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v15.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v9.4s, v10.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x11, x11, #0x48\n"
+ "subs x15, x15, #0x1\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x24, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v15.8h, v4.8h\n"
+ "ldr d15, [x23, x14]\n"
+ "smlal v3.4s, v10.4h, v19.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v10.8h, v19.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x21, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v15.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v17.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v15.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smlal v0.4s, v10.4h, v16.4h\n"
+ "smlal v6.4s, v10.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "smlal2 v22.4s, v10.8h, v16.8h\n"
+ "smlal2 v2.4s, v10.8h, v29.8h\n"
+ "and v23.16b, v9.16b, v26.16b\n"
+ "smlal v3.4s, v15.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v24.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v24.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v24.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v8.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "and v11.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v29.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqadd v9.4s, v9.4s, v23.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v13.16b, v30.16b, v26.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v22.16b, v26.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v23.16b, v2.16b, v26.16b\n"
+ "sqadd v3.4s, v3.4s, v8.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
+ "sqadd v6.4s, v6.4s, v29.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v13.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v23.4s\n"
"srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d14, [x11, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d16, [x10, x16]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x9, x16]\n"
- "str d23, [x28, x16]\n"
- "ldr q14, [x27, #0x0]\n"
- "ldr q11, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
- "str x27, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr d31, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d30, [x23, x17]\n"
- "ldr d29, [x22, x17]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "ldr d28, [x21, x17]\n"
- "ldr x20, [x15, #0x20]\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "ldr d27, [x20, x17]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "add x13, x13, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr d23, [x23, x14]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d10, [x22, x14]\n"
+ "ldr d11, [x21, x14]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d13, [x20, x14]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d27, [x20, x14]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q22, [x13, #0x0]\n"
- "ldr q10, [x12, #0x0]\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr q18, [x13, #0x10]\n"
- "ldr q26, [x12, #0x10]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr d29, [x20, x17]\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "ldr x26, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr d30, [x20, x17]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x17]\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "tst x7, #0x7\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr d31, [x26, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x17]\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "add x17, x17, #0x8\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q24, [x10, #0x10]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x23, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v15.4h, v7.4h\n"
+ "ldr x22, [x12, #0x68]\n"
+ "ldr x21, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x24, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v10.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "tst x16, #0x7\n"
+ "smlal2 v9.4s, v15.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x23, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v10.8h, v4.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal v3.4s, v15.4h, v19.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v15.8h, v19.8h\n"
+ "ldr d15, [x21, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x20, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v10.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v26.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v1.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v10.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v0.4s, v15.4h, v16.4h\n"
+ "smlal v6.4s, v15.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v1.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v24.4s\n"
+ "smlal2 v22.4s, v15.8h, v16.8h\n"
+ "smlal2 v2.4s, v15.8h, v29.8h\n"
+ "and v27.16b, v9.16b, v20.16b\n"
+ "smlal v3.4s, v10.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v10.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v26.4s\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v24.4s\n"
+ "and v4.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "and v17.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v24.4s\n"
"sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d14, [x11, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d16, [x10, x16]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x9, x16]\n"
- "str d23, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v8.16b, v30.16b, v20.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v26.16b, v22.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v11.16b, v2.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v4.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v26.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "add x13, x13, #0x8\n"
"beq 64f\n"
- "add x14, x14, #0x48\n"
+ "add x11, x11, #0x48\n"
"3:" // Oddments
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v14.4s }, [x27], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v11.d }[0], [x27], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v11.s }[2], [x27]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v11.s }[0], [x27]\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v14.d }[0], [x27], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v14.s }[2], [x27]\n"
+ "tbz x16, #1, 6f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v14.s }[0], [x27]\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x24, x23, [x12, #0x0]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldp x22, x21, [x12, #0x10]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
"ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x16, #1, 8f\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
"ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x23]\n"
+ "ld1 { v11.b }[6], [x22]\n"
+ "ld1 { v13.b }[6], [x21]\n"
"ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x23]\n"
+ "ld1 { v11.b }[4], [x22]\n"
+ "ld1 { v13.b }[4], [x21]\n"
"ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x16, #1, 10f\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
"ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x23]\n"
+ "ld1 { v11.b }[2], [x22]\n"
+ "ld1 { v13.b }[2], [x21]\n"
"ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x23]\n"
+ "ld1 { v11.b }[0], [x22]\n"
+ "ld1 { v13.b }[0], [x21]\n"
"ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "add x22, x22, x17\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "tbz x16, #2, 13f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 12f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x16, #1, 14f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v0.4s, v26.4h, v29.4h\n"
+ "smlal2 v22.4s, v26.8h, v29.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "tbz x16, #2, 17f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 16f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x16, #1, 18f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "add x21, x21, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x38]\n"
+ "smlal v6.4s, v23.4h, v4.4h\n"
+ "smlal2 v2.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 21f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 20f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "tbz x16, #1, 22f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x26, [x15, #0x40]\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "add x26, x26, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[6], [x26]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "smlal v28.4s, v21.4h, v7.4h\n"
+ "smlal2 v9.4s, v21.8h, v7.8h\n"
+ "smlal v3.4s, v21.4h, v19.4h\n"
+ "smlal2 v30.4s, v21.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 25f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 24f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[4], [x26]\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[2], [x26]\n"
+ "tbz x16, #1, 26f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[0], [x26]\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x12, #0x48]\n"
+ "smlal v28.4s, v18.4h, v1.4h\n"
+ "smlal2 v9.4s, v18.8h, v1.8h\n"
+ "smlal v3.4s, v18.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "tbz x16, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "add x25, x25, x17\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x12, #0x50]\n"
+ "smlal v28.4s, v15.4h, v4.4h\n"
+ "smlal2 v9.4s, v15.8h, v4.8h\n"
+ "smlal v3.4s, v15.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v0.4s, v15.4h, v31.4h\n"
+ "smlal2 v22.4s, v15.8h, v31.8h\n"
+ "smlal v6.4s, v15.4h, v8.4h\n"
+ "smlal2 v2.4s, v15.8h, v8.8h\n"
+ "tbz x16, #2, 33f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 32f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x16, #1, 34f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v28.4s, v20.4h, v17.4h\n"
+ "smlal2 v9.4s, v20.8h, v17.8h\n"
+ "smlal v0.4s, v20.4h, v19.4h\n"
+ "smlal2 v22.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 37f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 36f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x16, #1, 38f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "add x23, x23, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v3.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "smlal v6.4s, v11.4h, v1.4h\n"
+ "smlal2 v2.4s, v11.8h, v1.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 41f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 40f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x16, #1, 42f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x22, [x15, #0x68]\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "add x22, x22, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x68]\n"
+ "smlal v28.4s, v23.4h, v29.4h\n"
+ "smlal2 v9.4s, v23.8h, v29.8h\n"
+ "smlal v0.4s, v23.4h, v17.4h\n"
+ "smlal2 v22.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 45f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 44f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x16, #1, 46f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "add x21, x21, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v3.4s, v20.4h, v4.4h\n"
+ "smlal2 v30.4s, v20.8h, v4.8h\n"
+ "smlal v6.4s, v20.4h, v31.4h\n"
+ "smlal2 v2.4s, v20.8h, v31.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 49f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 48f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x16, #1, 50f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "smlal v6.4s, v8.4h, v29.4h\n"
+ "smlal2 v2.4s, v8.8h, v29.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 53f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 52f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x16, #1, 54f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "ld1 { v10.4s }, [x12], #0x10\n"
- "tbz x7, #1, 56f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v26.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x12]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "tbz x16, #2, 57f\n"
+ "ld1 { v7.4s }, [x10], #0x10\n"
+ "ld1 { v23.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 56f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v27.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[2], [x10]\n"
+ "ld1 { v27.s }[2], [x9]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v26.s }[0], [x12]\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[0], [x10]\n"
+ "ld1 { v27.s }[0], [x9]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "ld1 { v10.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v22.s }[2], [x13]\n"
- "ld1 { v10.s }[2], [x12]\n"
+ "tbz x16, #1, 58f\n"
+ "ld1 { v7.d }[0], [x10], #0x8\n"
+ "ld1 { v23.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[2], [x10]\n"
+ "ld1 { v23.s }[2], [x9]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v22.s }[0], [x13]\n"
- "ld1 { v10.s }[0], [x12]\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[0], [x10]\n"
+ "ld1 { v23.s }[0], [x9]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v7.4s\n"
+ "and v20.16b, v28.16b, v23.16b\n"
+ "add x28, x28, x13\n"
+ "add x27, x27, x13\n"
+ "sqrdmulh v9.4s, v9.4s, v11.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "and v4.16b, v9.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v7.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v7.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v20.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v19.16b, v3.16b, v23.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v11.4s\n"
+ "and v29.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v11.4s\n"
+ "and v26.16b, v6.16b, v23.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v11.4s\n"
+ "sqadd v9.4s, v9.4s, v4.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x7, #2, 61f\n"
- "st1 { v14.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v24.s }[0], [x9], #0x4\n"
- "st1 { v23.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 60f\n"
- "st1 { v14.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v24.h }[2], [x9], #0x2\n"
- "st1 { v23.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v24.b }[6], [x9], #0x1\n"
- "st1 { v23.b }[6], [x28], #0x1\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v8.16b, v22.16b, v27.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v13.16b, v2.16b, v27.16b\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v23.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v23.4s\n"
+ "sqadd v2.4s, v2.4s, v13.4s\n"
+ "srshl v9.4s, v9.4s, v27.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v27.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x16, #2, 61f\n"
+ "st1 { v28.s }[0], [x28], #0x4\n"
+ "st1 { v3.s }[0], [x27], #0x4\n"
+ "st1 { v0.s }[0], [x26], #0x4\n"
+ "st1 { v6.s }[0], [x25], #0x4\n"
+ "tbz x16, #1, 60f\n"
+ "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v3.h }[2], [x27], #0x2\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v6.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v3.b }[6], [x27], #0x1\n"
+ "st1 { v0.b }[6], [x26], #0x1\n"
+ "st1 { v6.b }[6], [x25], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v24.b }[4], [x9], #0x1\n"
- "st1 { v23.b }[4], [x28], #0x1\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[4], [x28], #0x1\n"
+ "st1 { v3.b }[4], [x27], #0x1\n"
+ "st1 { v0.b }[4], [x26], #0x1\n"
+ "st1 { v6.b }[4], [x25], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "st1 { v14.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v24.h }[0], [x9], #0x2\n"
- "st1 { v23.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v24.b }[2], [x9], #0x1\n"
- "st1 { v23.b }[2], [x28], #0x1\n"
+ "tbz x16, #1, 62f\n"
+ "st1 { v28.h }[0], [x28], #0x2\n"
+ "st1 { v3.h }[0], [x27], #0x2\n"
+ "st1 { v0.h }[0], [x26], #0x2\n"
+ "st1 { v6.h }[0], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v3.b }[2], [x27], #0x1\n"
+ "st1 { v0.b }[2], [x26], #0x1\n"
+ "st1 { v6.b }[2], [x25], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v24.b }[0], [x9], #0x1\n"
- "st1 { v23.b }[0], [x28], #0x1\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[0], [x28], #0x1\n"
+ "st1 { v3.b }[0], [x27], #0x1\n"
+ "st1 { v0.b }[0], [x26], #0x1\n"
+ "st1 { v6.b }[0], [x25], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index b27e8687e0..50778e9cbb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index a1e5c669b7..c807cb3ade 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,15 +104,15 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v19.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v22.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
"mov x17, #0x0\n"
- "ld1r { v23.8h }, [x20]\n"
+ "ld1r { v5.8h }, [x20]\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
@@ -121,563 +121,563 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
- "ldr q15, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "str x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d26, [x27, x17]\n"
+ "ldr d18, [x26, x17]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d10, [x25, x17]\n"
+ "ldr d27, [x24, x17]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d17, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d15, [x21, x17]\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x13, #0x0]\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x22, [x15, #0x48]\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "ldr q21, [x12, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr q31, [x13, #0x0]\n"
+ "ldr q0, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
"ldr d26, [x20, x17]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x23, x17]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
"ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x21, [x15, #0xb8]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"add x14, x14, #0x48\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "and v2.16b, v15.16b, v21.16b\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x17, x17, #0x8\n"
"subs x8, x8, #0x1\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
"add x13, x13, #0x20\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add x17, x17, #0x8\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "and v16.16b, v13.16b, v31.16b\n"
"add x12, x12, #0x20\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v10.16b, v8.16b, v0.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v12.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v31.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v31.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v0.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v0.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v0.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v12.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d17, [x10, x16]\n"
- "str d11, [x9, x16]\n"
- "str d9, [x28, x16]\n"
- "ldr q15, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x22, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d30, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr d24, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x27, x17]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldr d18, [x26, x17]\n"
+ "ldr d10, [x25, x17]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "ldr d27, [x24, x17]\n"
+ "ldr d17, [x23, x17]\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d19, [x22, x17]\n"
+ "ldr d15, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x13, #0x0]\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x22, [x15, #0x48]\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "ldr q21, [x12, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr q0, [x13, #0x0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
"ldr d26, [x20, x17]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x23, x17]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
"ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x21, [x15, #0xb8]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"ldr x20, [x15, #0xc0]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
"tst x7, #0x7\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x20, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x17, x17, #0x8\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
"add x13, x13, #0x20\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "and v2.16b, v15.16b, v21.16b\n"
"add x12, x12, #0x20\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add x17, x17, #0x8\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "and v16.16b, v13.16b, v31.16b\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v0.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v23.16b, v8.16b, v31.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v7.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v0.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v0.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v31.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v31.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v7.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
"srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d17, [x10, x16]\n"
- "str d11, [x9, x16]\n"
- "str d9, [x28, x16]\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v13.s }[2], [x22]\n"
+ "ld1 { v2.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v13.s }[0], [x22]\n"
+ "ld1 { v2.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x22]\n"
+ "ld1 { v8.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -687,700 +687,700 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v18.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x23], #0x4\n"
+ "ld1 { v19.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v18.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v15.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v18.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v15.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v18.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x23], #0x2\n"
+ "ld1 { v19.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v18.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v15.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v18.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x23]\n"
+ "ld1 { v19.b }[0], [x22]\n"
+ "ld1 { v15.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "add x23, x23, x17\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x23], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x23], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x23]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x23]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x23], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x23]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x23]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v31.4h, v23.4h\n"
+ "smlal2 v4.4s, v31.8h, v23.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v28.4h, v7.4h\n"
+ "smlal2 v4.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v7.4h\n"
+ "smlal2 v2.4s, v27.8h, v7.8h\n"
+ "smlal v21.4s, v27.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v24.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v0.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v0.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v0.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v0.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v0.4h, v24.4h\n"
+ "smlal2 v1.4s, v0.8h, v24.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "smlal v8.4s, v15.4h, v3.4h\n"
+ "smlal2 v2.4s, v15.8h, v3.8h\n"
+ "smlal v20.4s, v15.4h, v12.4h\n"
+ "smlal2 v1.4s, v15.8h, v12.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v0.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v0.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v0.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v0.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
"ldr x20, [x15, #0x70]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
+ "smlal v20.4s, v0.4h, v23.4h\n"
+ "smlal2 v1.4s, v0.8h, v23.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v8.4s, v6.4h, v9.4h\n"
+ "smlal2 v2.4s, v6.8h, v9.8h\n"
+ "smlal v20.4s, v6.4h, v11.4h\n"
+ "smlal2 v1.4s, v6.8h, v11.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v16.4s, v27.4h, v23.4h\n"
+ "smlal2 v14.4s, v27.8h, v23.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v10.4h, v9.4h\n"
+ "smlal2 v4.4s, v10.8h, v9.8h\n"
+ "smlal v16.4s, v10.4h, v11.4h\n"
+ "smlal2 v14.4s, v10.8h, v11.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x15, #0x90]\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v16.4s, v28.4h, v7.4h\n"
+ "smlal2 v14.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "add x24, x24, x17\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v15.4h, v3.4h\n"
+ "smlal2 v1.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v6.4h, v30.4h\n"
+ "smlal2 v4.4s, v6.8h, v30.8h\n"
+ "smlal v16.4s, v6.4h, v25.4h\n"
+ "smlal2 v14.4s, v6.8h, v25.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v12.4h, v7.4h\n"
+ "smlal2 v1.4s, v12.8h, v7.8h\n"
+ "smlal v16.4s, v12.4h, v24.4h\n"
+ "smlal2 v14.4s, v12.8h, v24.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v16.4s, v10.4h, v9.4h\n"
+ "smlal2 v14.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v15.4h, v30.4h\n"
+ "smlal2 v1.4s, v15.8h, v30.8h\n"
+ "smlal v16.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v30.4h\n"
+ "smlal2 v14.4s, v28.8h, v30.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v18.4s }, [x13], #0x10\n"
- "ld1 { v21.4s }, [x12], #0x10\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v24.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v24.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v24.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v21.d }[0], [x12], #0x8\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v21.s }[2], [x12]\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v21.s }[0], [x12]\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "and v2.16b, v15.16b, v21.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "and v17.16b, v8.16b, v23.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v16.16b, v13.16b, v31.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
+ "and v11.16b, v2.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v23.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "and v19.16b, v16.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "and v18.16b, v4.16b, v24.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v25.16b, v14.16b, v24.16b\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "srshl v2.4s, v2.4s, v24.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v24.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v17.s }[0], [x10], #0x4\n"
- "st1 { v11.s }[0], [x9], #0x4\n"
- "st1 { v9.s }[0], [x28], #0x4\n"
+ "st1 { v8.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v17.h }[2], [x10], #0x2\n"
- "st1 { v11.h }[2], [x9], #0x2\n"
- "st1 { v9.h }[2], [x28], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v17.b }[6], [x10], #0x1\n"
- "st1 { v11.b }[6], [x9], #0x1\n"
- "st1 { v9.b }[6], [x28], #0x1\n"
+ "st1 { v8.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v17.b }[4], [x10], #0x1\n"
- "st1 { v11.b }[4], [x9], #0x1\n"
- "st1 { v9.b }[4], [x28], #0x1\n"
+ "st1 { v8.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v17.h }[0], [x10], #0x2\n"
- "st1 { v11.h }[0], [x9], #0x2\n"
- "st1 { v9.h }[0], [x28], #0x2\n"
+ "st1 { v8.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v17.b }[2], [x10], #0x1\n"
- "st1 { v11.b }[2], [x9], #0x1\n"
- "st1 { v9.b }[2], [x28], #0x1\n"
+ "st1 { v8.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v17.b }[0], [x10], #0x1\n"
- "st1 { v11.b }[0], [x9], #0x1\n"
- "st1 { v9.b }[0], [x28], #0x1\n"
+ "st1 { v8.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 7075f58f92..f2ab5831d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index eec3ba5900..c8fe567e77 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -111,2071 +111,2071 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x9, x4, #0x3\n"
- "add x24, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v9.16b }, [x24]\n"
- "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x28, x22, %[offsetof_Requantize32_c_offset]\n"
- "add x24, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v15.8h }, [x28]\n"
- "ld1r { v14.8h }, [x24]\n"
- "add x20, x22, %[offsetof_Requantize32_maxval]\n"
- "mov x3, #0x0\n"
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x3, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v2.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v25.8h }, [x21]\n"
"ld1r { v12.8h }, [x20]\n"
- "mov x1, #0x0\n"
- "add x2, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x0, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x7, x8, [x25, #0x0]\n"
- "ldp x17, x16, [x25, #0x10]\n"
- "cbz x9, 3f\n"
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "subs x9, x9, #0x1\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr q11, [x13, #0x0]\n"
- "ldr q13, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "mov v8.16b, v11.16b\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x4, #0x0\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "mov x5, #0x0\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "subs x3, x3, #0x1\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "ldp x27, x26, [x6, #0x10]\n"
"mov v7.16b, v13.16b\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "ldr d31, [x10, x3]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldr d30, [x28, x3]\n"
- "ldr d29, [x27, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
+ "mov v14.16b, v24.16b\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "ldr d10, [x9, x4]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldr d16, [x28, x4]\n"
+ "ldr d23, [x27, x4]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr d30, [x26, x4]\n"
+ "ldr d4, [x25, x4]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr d28, [x26, x3]\n"
- "ldr d27, [x25, x3]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d28, [x24, x4]\n"
+ "ldr d31, [x23, x4]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr d23, [x24, x3]\n"
- "ldr d25, [x23, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr d24, [x22, x3]\n"
- "ldr d26, [x21, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr d22, [x20, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr d1, [x22, x4]\n"
+ "ldr d9, [x21, x4]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x6, #0x0]\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr q16, [x6, #0x10]\n"
- "ldr q10, [x5, #0x10]\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x3]\n"
+ "ldr d5, [x7, #0x28]\n"
+ "ldr d6, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d19, [x7, #0x38]\n"
+ "ldr d0, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d20, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x21, [x2, #0x60]\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x3]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ldr x25, [x2, #0x70]\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "ldr d0, [x0, #0x28]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x23, [x2, #0x80]\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x3]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "ldr x15, [x2, #0x90]\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d1, [x0, #0x30]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x14, [x2, #0xa0]\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x3]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "ldr x12, [x2, #0xb0]\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d2, [x0, #0x38]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x11, [x2, #0xc0]\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x25, x3]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "ldr x22, [x2, #0xd0]\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "ldr d3, [x0, #0x40]\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x3]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x3]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x25, [x2, #0xf0]\n"
- "subs x9, x9, #0x1\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x0, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "add x6, x6, #0x20\n"
- "add x5, x5, #0x20\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x3]\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x22, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal v13.4s, v28.4h, v19.4h\n"
+ "ldr x21, [x6, #0xb0]\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "ldr d5, [x23, x4]\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "smlal v8.4s, v31.4h, v6.4h\n"
+ "ldr x12, [x6, #0xc0]\n"
+ "ldr x11, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v19.8h\n"
+ "smlal v13.4s, v16.4h, v0.4h\n"
+ "ldr x10, [x6, #0xd0]\n"
+ "ldr x9, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d6, [x22, x4]\n"
+ "smlal v7.4s, v16.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v19.4h\n"
+ "smlal v8.4s, v29.4h, v19.4h\n"
+ "ldr x28, [x6, #0xe0]\n"
+ "ldr x27, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x26, [x6, #0xf0]\n"
+ "ldr x25, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v19.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v19.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v19.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal v7.4s, v21.4h, v0.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v0.4h\n"
+ "smlal v8.4s, v1.4h, v0.4h\n"
+ "ldr x24, [x6, #0x100]\n"
+ "ldr x23, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v20.4h\n"
+ "ldr x22, [x6, #0x110]\n"
+ "ldr x21, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v0.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v0.8h\n"
+ "ldr d0, [x20, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v24.4s, v11.8h, v20.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x12, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x11, x4]\n"
+ "smlal v7.4s, v15.4h, v20.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v20.4h\n"
+ "smlal v8.4s, v3.4h, v20.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v20.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v20.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v20.8h\n"
+ "ldr d20, [x10, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v5.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v23.8h\n"
+ "ldr d23, [x9, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x0, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x23, [x2, #0x100]\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x0, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x3]\n"
+ "smlal v27.4s, v5.4h, v30.4h\n"
+ "smlal v8.4s, v6.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v5.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v30.8h\n"
+ "ldr d30, [x28, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x0, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x3]\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x3]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x0, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x0, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x0, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x0, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "ldr d23, [x10, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x0, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x3]\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x3]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x0, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal v27.4s, v6.4h, v28.4h\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v6.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x3]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x0, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x0, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x0, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x0, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x3]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "add x0, x0, #0xc8\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "add x3, x3, #0x8\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
- "sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
+ "smlal v27.4s, v19.4h, v16.4h\n"
+ "smlal v8.4s, v0.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x26, x4]\n"
+ "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x25, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v5.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x24, x4]\n"
+ "smlal v7.4s, v5.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v20.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v13.4s, v6.4h, v31.4h\n"
+ "smlal2 v14.4s, v5.8h, v11.8h\n"
+ "ldr d5, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v11.8h\n"
+ "ldr d11, [x23, x4]\n"
+ "smlal v7.4s, v6.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v6.8h, v31.8h\n"
+ "smlal v13.4s, v19.4h, v29.4h\n"
+ "smlal2 v14.4s, v6.8h, v15.8h\n"
+ "ldr d6, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v20.8h, v15.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x22, x4]\n"
+ "smlal v7.4s, v19.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "add x7, x7, #0xc8\n"
+ "smlal2 v24.4s, v19.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v0.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v0.8h, v29.8h\n"
+ "ldr q0, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "smlal2 v14.4s, v20.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v23.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "sqadd v13.4s, v13.4s, v23.4s\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "and v10.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v19.4h, v6.4h\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "smlal2 v17.4s, v19.8h, v6.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v28.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v0.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v0.4s\n"
+ "and v23.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v22.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v28.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
- "srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d11, [x7, x1]\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d20, [x8, x1]\n"
- "str d8, [x17, x1]\n"
- "str d6, [x16, x1]\n"
- "ldr q11, [x13, #0x0]\n"
- "ldr q13, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "add x1, x1, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "mov v8.16b, v11.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "add x5, x5, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
"mov v7.16b, v13.16b\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr d31, [x10, x3]\n"
- "ldr d30, [x28, x3]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr d29, [x27, x3]\n"
- "ldr d28, [x26, x3]\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr d27, [x25, x3]\n"
- "ldr d23, [x24, x3]\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldr d10, [x9, x4]\n"
+ "ldr d16, [x28, x4]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr d23, [x27, x4]\n"
+ "ldr d30, [x26, x4]\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d4, [x25, x4]\n"
+ "ldr d28, [x24, x4]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d31, [x23, x4]\n"
+ "ldr d1, [x22, x4]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr d25, [x23, x3]\n"
- "ldr d24, [x22, x3]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ldr d9, [x21, x4]\n"
+ "ldr d11, [x20, x4]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr d26, [x21, x3]\n"
- "ldr d22, [x20, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ushll v22.8h, v22.8b, #0x0\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x6, #0x0]\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr q16, [x6, #0x10]\n"
- "ldr q10, [x5, #0x10]\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x3]\n"
+ "ldr d0, [x7, #0x28]\n"
+ "ldr d20, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d6, [x7, #0x38]\n"
+ "ldr d19, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d5, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x21, [x2, #0x60]\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x3]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ldr x25, [x2, #0x70]\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "ldr d0, [x0, #0x28]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x23, [x2, #0x80]\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x3]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "ldr x15, [x2, #0x90]\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d1, [x0, #0x30]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x14, [x2, #0xa0]\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x3]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "ldr x12, [x2, #0xb0]\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d2, [x0, #0x38]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x11, [x2, #0xc0]\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x25, x3]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "ldr x22, [x2, #0xd0]\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "ldr d3, [x0, #0x40]\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x3]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x3]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x25, [x2, #0xf0]\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x0, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "tst x4, #0x7\n"
- "add x6, x6, #0x20\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x3]\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x22, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x21, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal v13.4s, v28.4h, v6.4h\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "ldr x12, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "smlal v8.4s, v31.4h, v20.4h\n"
+ "ldr x11, [x6, #0xc0]\n"
+ "ldr x10, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v6.8h\n"
+ "smlal v13.4s, v16.4h, v19.4h\n"
+ "ldr x9, [x6, #0xd0]\n"
+ "ldr x28, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v20.8h\n"
+ "ldr d20, [x21, x4]\n"
+ "smlal v7.4s, v16.4h, v6.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v6.4h\n"
+ "smlal v8.4s, v29.4h, v6.4h\n"
+ "ldr x27, [x6, #0xe0]\n"
+ "ldr x26, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v19.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x25, [x6, #0xf0]\n"
+ "ldr x24, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v6.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal v7.4s, v21.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v19.4h\n"
+ "smlal v8.4s, v1.4h, v19.4h\n"
+ "ldr x23, [x6, #0x100]\n"
+ "ldr x22, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v5.4h\n"
+ "ldr x21, [x6, #0x110]\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v19.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v19.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v19.8h\n"
+ "ldr d19, [x12, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "tst x2, #0x7\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x11, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x10, x4]\n"
+ "smlal v7.4s, v15.4h, v5.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v5.4h\n"
+ "smlal v8.4s, v3.4h, v5.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v5.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v5.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v5.8h\n"
+ "ldr d5, [x9, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v0.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v0.8h, v23.8h\n"
+ "ldr d23, [x28, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr x23, [x2, #0x100]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x0, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x0, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x3]\n"
+ "smlal v27.4s, v0.4h, v30.4h\n"
+ "smlal v8.4s, v20.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v0.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v30.8h\n"
+ "ldr d30, [x27, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x0, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x3]\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x3]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x0, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x0, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x0, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x0, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "ldr d23, [x10, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x0, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x3]\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x3]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x0, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal v8.4s, v6.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v28.8h\n"
+ "ldr d28, [x26, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x3]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x0, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x0, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x0, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x0, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x3]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "add x3, x3, #0x8\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
- "sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
+ "smlal v27.4s, v6.4h, v16.4h\n"
+ "smlal v8.4s, v19.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x25, x4]\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v19.8h, v16.8h\n"
+ "ldr d16, [x24, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v0.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x23, x4]\n"
+ "smlal v7.4s, v0.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v0.8h, v15.8h\n"
+ "smlal v13.4s, v20.4h, v31.4h\n"
+ "smlal2 v14.4s, v0.8h, v11.8h\n"
+ "ldr d0, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v11.8h\n"
+ "ldr d11, [x22, x4]\n"
+ "smlal v7.4s, v20.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v20.8h, v31.8h\n"
+ "smlal v13.4s, v6.4h, v29.4h\n"
+ "smlal2 v14.4s, v20.8h, v15.8h\n"
+ "ldr d20, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x21, x4]\n"
+ "smlal v7.4s, v6.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v19.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v29.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v5.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v5.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v5.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v21.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "and v16.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v6.4h, v20.4h\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "smlal2 v17.4s, v6.8h, v20.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "and v3.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v19.16b, v22.16b, v29.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v30.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
- "srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v3.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d11, [x7, x1]\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d20, [x8, x1]\n"
- "str d8, [x17, x1]\n"
- "str d6, [x16, x1]\n"
- "add x1, x1, #0x8\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "add x5, x5, #0x8\n"
"beq 124f\n"
- "add x0, x0, #0xc8\n"
+ "add x7, x7, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
- "ld1 { v11.4s }, [x13], #0x10\n"
- "tbz x4, #1, 4f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v13.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
- "ld1 { v11.d }[0], [x13], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v11.s }[2], [x13]\n"
+ "tbz x2, #1, 6f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v11.s }[0], [x13]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "mov v8.16b, v11.16b\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
"mov v7.16b, v13.16b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "add x10, x10, x3\n"
- "add x28, x28, x3\n"
- "add x27, x27, x3\n"
- "add x26, x26, x3\n"
- "add x25, x25, x3\n"
- "add x24, x24, x3\n"
- "add x23, x23, x3\n"
- "add x22, x22, x3\n"
- "add x21, x21, x3\n"
- "add x20, x20, x3\n"
- "tbz x4, #2, 9f\n"
- "ld1 { v31.s }[0], [x10], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 8f\n"
- "ld1 { v31.h }[2], [x10], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[6], [x10]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "add x9, x9, x4\n"
+ "add x28, x28, x4\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v16.s }[0], [x28], #0x4\n"
+ "ld1 { v23.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v4.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v1.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v4.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v1.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[6], [x9]\n"
+ "ld1 { v16.b }[6], [x28]\n"
+ "ld1 { v23.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v4.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v1.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[4], [x10]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[4], [x9]\n"
+ "ld1 { v16.b }[4], [x28]\n"
+ "ld1 { v23.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v4.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v1.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
- "ld1 { v31.h }[0], [x10], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[2], [x10]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x2, #1, 10f\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v16.h }[0], [x28], #0x2\n"
+ "ld1 { v23.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v4.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v1.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[2], [x9]\n"
+ "ld1 { v16.b }[2], [x28]\n"
+ "ld1 { v23.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v4.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[0], [x10]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
- "ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[0], [x9]\n"
+ "ld1 { v16.b }[0], [x28]\n"
+ "ld1 { v23.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v4.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v1.b }[0], [x22]\n"
+ "ld1 { v9.b }[0], [x21]\n"
+ "ld1 { v11.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "ldr x20, [x6, #0x50]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
+ "add x20, x20, x4\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "add x20, x20, x3\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x2, #1, 14f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "add x22, x22, x3\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v8.4s, v5.4h, v29.4h\n"
+ "smlal2 v17.4s, v5.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "smlal v27.4s, v5.4h, v18.4h\n"
+ "smlal2 v22.4s, v5.8h, v18.8h\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x2, #1, 18f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x21, [x2, #0x60]\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "add x21, x21, x3\n"
- "tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v8.4s, v10.4h, v18.4h\n"
+ "smlal2 v17.4s, v10.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "tbz x2, #1, 22f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x0, #0x28]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr d6, [x7, #0x28]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v7.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "smlal v27.4s, v10.4h, v3.4h\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v6.4h\n"
+ "smlal2 v24.4s, v23.8h, v6.8h\n"
+ "smlal v7.4s, v30.4h, v6.4h\n"
+ "smlal2 v14.4s, v30.8h, v6.8h\n"
+ "smlal v27.4s, v11.4h, v6.4h\n"
+ "smlal2 v22.4s, v11.8h, v6.8h\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x2, #1, 26f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x0, #0x30]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x25, [x2, #0x70]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "add x25, x25, x3\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x25]\n"
+ "ldr d4, [x7, #0x30]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v17.4s, v20.8h, v6.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v14.4s, v28.8h, v4.8h\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal2 v22.4s, v20.8h, v4.8h\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x25]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "tbz x2, #1, 30f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x0, #0x38]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "add x26, x26, x3\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[6], [x26]\n"
+ "ldr d30, [x7, #0x38]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v28.4h, v30.4h\n"
+ "smlal2 v24.4s, v28.8h, v30.8h\n"
+ "smlal v7.4s, v5.4h, v30.4h\n"
+ "smlal2 v14.4s, v5.8h, v30.8h\n"
+ "smlal v27.4s, v23.4h, v30.4h\n"
+ "smlal2 v22.4s, v23.8h, v30.8h\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[4], [x26]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[2], [x26]\n"
+ "tbz x2, #1, 34f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[0], [x26]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x0, #0x40]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x23, [x2, #0x80]\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "add x23, x23, x3\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ldr d16, [x7, #0x40]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v8.4s, v3.4h, v30.4h\n"
+ "smlal2 v17.4s, v3.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v7.4s, v10.4h, v16.4h\n"
+ "smlal2 v14.4s, v10.8h, v16.8h\n"
+ "smlal v27.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "tbz x2, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x0, #0x48]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "add x24, x24, x3\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ldr d1, [x7, #0x48]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v8.4s, v6.4h, v16.4h\n"
+ "smlal2 v17.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v10.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v9.8h, v1.8h\n"
+ "smlal v27.4s, v6.4h, v1.4h\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "tbz x2, #2, 41f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 40f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x2, #1, 42f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x0, #0x50]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x15, [x2, #0x90]\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "add x15, x15, x3\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ldr d28, [x7, #0x50]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v8.4s, v18.4h, v1.4h\n"
+ "smlal2 v17.4s, v18.8h, v1.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v11.4h, v28.4h\n"
+ "smlal2 v24.4s, v11.8h, v28.8h\n"
+ "smlal v7.4s, v20.4h, v28.4h\n"
+ "smlal2 v14.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "tbz x2, #1, 46f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x3\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v27.4s, v30.4h, v28.4h\n"
+ "smlal2 v22.4s, v30.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 49f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 48f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x2, #1, 50f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x0, #0x58]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x14, [x2, #0xa0]\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "add x14, x14, x3\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ldr d0, [x7, #0x58]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v0.4h\n"
+ "smlal2 v24.4s, v20.8h, v0.8h\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v23.8h, v0.8h\n"
+ "smlal v27.4s, v19.4h, v0.4h\n"
+ "smlal2 v22.4s, v19.8h, v0.8h\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "tbz x2, #1, 54f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x0, #0x60]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "add x13, x13, x3\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "tbz x4, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ldr d10, [x7, #0x60]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v8.4s, v9.4h, v0.4h\n"
+ "smlal2 v17.4s, v9.8h, v0.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v10.4h\n"
+ "smlal2 v24.4s, v23.8h, v10.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v14.4s, v3.8h, v10.8h\n"
+ "smlal v27.4s, v9.4h, v10.4h\n"
+ "smlal2 v22.4s, v9.8h, v10.8h\n"
+ "tbz x2, #2, 57f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 56f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "tbz x2, #1, 58f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x0, #0x68]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x12, [x2, #0xb0]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x12, x12, x3\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ldr d28, [x7, #0x68]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal2 v17.4s, v20.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v3.4h, v28.4h\n"
+ "smlal2 v24.4s, v3.8h, v28.8h\n"
+ "smlal v7.4s, v6.4h, v28.4h\n"
+ "smlal2 v14.4s, v6.8h, v28.8h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "tbz x2, #1, 62f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x0, #0x70]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ldr d23, [x7, #0x70]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v8.4s, v5.4h, v28.4h\n"
+ "smlal2 v17.4s, v5.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v6.4h, v23.4h\n"
+ "smlal2 v24.4s, v6.8h, v23.8h\n"
+ "smlal v7.4s, v18.4h, v23.4h\n"
+ "smlal2 v14.4s, v18.8h, v23.8h\n"
+ "smlal v27.4s, v5.4h, v23.4h\n"
+ "smlal2 v22.4s, v5.8h, v23.8h\n"
+ "tbz x2, #2, 65f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 64f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x2, #1, 66f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x0, #0x78]\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x11, [x2, #0xc0]\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "add x11, x11, x3\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ldr d4, [x7, #0x78]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v8.4s, v29.4h, v23.4h\n"
+ "smlal2 v17.4s, v29.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v19.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v4.8h\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "tbz x2, #1, 70f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x10, x10, x3\n"
- "tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x10], #0x4\n"
- "tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x10]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x6, #0xc8]\n"
+ "smlal v27.4s, v18.4h, v4.4h\n"
+ "smlal2 v22.4s, v18.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 73f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 72f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x10]\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x10], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x10]\n"
+ "tbz x2, #1, 74f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x10]\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x0, #0x80]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x22, [x2, #0xd0]\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "add x22, x22, x3\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "ldr d23, [x7, #0x80]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd0]\n"
+ "smlal v8.4s, v1.4h, v4.4h\n"
+ "smlal2 v17.4s, v1.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v19.4h, v23.4h\n"
+ "smlal2 v24.4s, v19.8h, v23.8h\n"
+ "smlal v7.4s, v9.4h, v23.4h\n"
+ "smlal2 v14.4s, v9.8h, v23.8h\n"
+ "smlal v27.4s, v1.4h, v23.4h\n"
+ "smlal2 v22.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x2, #1, 78f\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x0, #0x88]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "add x28, x28, x3\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ldr d30, [x7, #0x88]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd8]\n"
+ "smlal v8.4s, v4.4h, v23.4h\n"
+ "smlal2 v17.4s, v4.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v9.4h, v30.4h\n"
+ "smlal2 v24.4s, v9.8h, v30.8h\n"
+ "smlal v7.4s, v20.4h, v30.4h\n"
+ "smlal2 v14.4s, v20.8h, v30.8h\n"
+ "smlal v27.4s, v4.4h, v30.4h\n"
+ "smlal2 v22.4s, v4.8h, v30.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x2, #1, 82f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x0, #0x90]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "add x27, x27, x3\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ldr d3, [x7, #0x90]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe0]\n"
+ "smlal v8.4s, v21.4h, v30.4h\n"
+ "smlal2 v17.4s, v21.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v7.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v5.8h, v3.8h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "tbz x2, #2, 85f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "tbz x2, #1, 86f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x0, #0x98]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "add x26, x26, x3\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ldr d19, [x7, #0x98]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe8]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v19.4h\n"
+ "smlal2 v24.4s, v5.8h, v19.8h\n"
+ "smlal v7.4s, v29.4h, v19.4h\n"
+ "smlal2 v14.4s, v29.8h, v19.8h\n"
+ "smlal v27.4s, v30.4h, v19.4h\n"
+ "smlal2 v22.4s, v30.8h, v19.8h\n"
+ "tbz x2, #2, 89f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 88f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "tbz x2, #1, 90f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x0, #0xa0]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x25, [x2, #0xf0]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x25, x25, x3\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ldr d23, [x7, #0xa0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "smlal v8.4s, v20.4h, v19.4h\n"
+ "smlal2 v17.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v18.4h, v23.4h\n"
+ "smlal2 v24.4s, v18.8h, v23.8h\n"
+ "smlal v7.4s, v1.4h, v23.4h\n"
+ "smlal2 v14.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 93f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 92f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "tbz x2, #1, 94f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x3\n"
- "tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0xf8]\n"
+ "smlal v27.4s, v10.4h, v23.4h\n"
+ "smlal2 v22.4s, v10.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 97f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 96f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "tbz x2, #1, 98f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x0, #0xa8]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x23, [x2, #0x100]\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "add x23, x23, x3\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ldr d5, [x7, #0xa8]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x20, [x6, #0x100]\n"
+ "smlal v8.4s, v18.4h, v23.4h\n"
+ "smlal2 v17.4s, v18.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v1.4h, v5.4h\n"
+ "smlal2 v24.4s, v1.8h, v5.8h\n"
+ "smlal v7.4s, v4.4h, v5.4h\n"
+ "smlal2 v14.4s, v4.8h, v5.8h\n"
+ "smlal v27.4s, v18.4h, v5.4h\n"
+ "smlal2 v22.4s, v18.8h, v5.8h\n"
+ "tbz x2, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x2, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x0, #0xb0]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "add x15, x15, x3\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ldr d18, [x7, #0xb0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr x20, [x6, #0x108]\n"
+ "smlal v8.4s, v9.4h, v5.4h\n"
+ "smlal2 v17.4s, v9.8h, v5.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v21.4h, v18.4h\n"
+ "smlal2 v14.4s, v21.8h, v18.8h\n"
+ "smlal v27.4s, v9.4h, v18.4h\n"
+ "smlal2 v22.4s, v9.8h, v18.8h\n"
+ "tbz x2, #2, 105f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 104f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "tbz x2, #1, 106f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x0, #0xb8]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x21, x21, x3\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ldr d11, [x7, #0xb8]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "ldr x20, [x6, #0x110]\n"
+ "smlal v8.4s, v5.4h, v18.4h\n"
+ "smlal2 v17.4s, v5.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v21.4h, v11.4h\n"
+ "smlal2 v24.4s, v21.8h, v11.8h\n"
+ "smlal v7.4s, v30.4h, v11.4h\n"
+ "smlal2 v14.4s, v30.8h, v11.8h\n"
+ "smlal v27.4s, v5.4h, v11.4h\n"
+ "smlal2 v22.4s, v5.8h, v11.8h\n"
+ "tbz x2, #2, 109f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 108f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "tbz x2, #1, 110f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x0, #0xc0]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d16, [x7, #0xc0]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal v8.4s, v18.4h, v11.4h\n"
+ "smlal2 v17.4s, v18.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v16.4h\n"
+ "smlal2 v24.4s, v30.8h, v16.8h\n"
+ "smlal v7.4s, v20.4h, v16.4h\n"
+ "smlal2 v14.4s, v20.8h, v16.8h\n"
+ "smlal v27.4s, v18.4h, v16.4h\n"
+ "smlal2 v22.4s, v18.8h, v16.8h\n"
+ "tbz x2, #2, 113f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 112f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x4, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x2, #1, 114f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v18.4s }, [x6], #0x10\n"
- "ld1 { v21.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v16.d }[0], [x6], #0x8\n"
- "ld1 { v10.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v16.s }[2], [x6]\n"
- "ld1 { v10.s }[2], [x5]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v8.4s, v21.4h, v16.4h\n"
+ "smlal2 v17.4s, v21.8h, v16.8h\n"
+ "tbz x2, #2, 117f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "tbz x2, #1, 116f\n"
+ "ld1 { v18.d }[0], [x8], #0x8\n"
+ "ld1 { v0.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[2], [x8]\n"
+ "ld1 { v0.s }[2], [x17]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v16.s }[0], [x6]\n"
- "ld1 { v10.s }[0], [x5]\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[0], [x8]\n"
+ "ld1 { v0.s }[0], [x17]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v18.d }[0], [x6], #0x8\n"
- "ld1 { v21.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v18.s }[2], [x6]\n"
- "ld1 { v21.s }[2], [x5]\n"
+ "tbz x2, #1, 118f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v21.s }[2], [x17]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v18.s }[0], [x6]\n"
- "ld1 { v21.s }[0], [x5]\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v21.s }[0], [x17]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "add x7, x7, x1\n"
- "add x8, x8, x1\n"
"sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add x17, x17, x1\n"
- "add x16, x16, x1\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
+ "and v5.16b, v13.16b, v21.16b\n"
+ "add x16, x16, x5\n"
+ "add x15, x15, x5\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "add x14, x14, x5\n"
+ "add x13, x13, x5\n"
+ "and v2.16b, v24.16b, v0.16b\n"
"sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v21.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "and v20.16b, v27.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v18.4s\n"
+ "and v31.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v0.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v11.16b, v22.16b, v0.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
+ "and v10.16b, v17.16b, v0.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v11.4s\n"
"srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v17.4s, v17.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v0.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v11.s }[0], [x7], #0x4\n"
- "st1 { v20.s }[0], [x8], #0x4\n"
- "st1 { v8.s }[0], [x17], #0x4\n"
- "st1 { v6.s }[0], [x16], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v11.h }[2], [x7], #0x2\n"
- "st1 { v20.h }[2], [x8], #0x2\n"
- "st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v6.h }[2], [x16], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[6], [x7], #0x1\n"
- "st1 { v20.b }[6], [x8], #0x1\n"
- "st1 { v8.b }[6], [x17], #0x1\n"
- "st1 { v6.b }[6], [x16], #0x1\n"
+ "tbz x2, #2, 121f\n"
+ "st1 { v13.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x15], #0x4\n"
+ "st1 { v27.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 120f\n"
+ "st1 { v13.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x15], #0x2\n"
+ "st1 { v27.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x15], #0x1\n"
+ "st1 { v27.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x13], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[4], [x7], #0x1\n"
- "st1 { v20.b }[4], [x8], #0x1\n"
- "st1 { v8.b }[4], [x17], #0x1\n"
- "st1 { v6.b }[4], [x16], #0x1\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x15], #0x1\n"
+ "st1 { v27.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x13], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v11.h }[0], [x7], #0x2\n"
- "st1 { v20.h }[0], [x8], #0x2\n"
- "st1 { v8.h }[0], [x17], #0x2\n"
- "st1 { v6.h }[0], [x16], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[2], [x7], #0x1\n"
- "st1 { v20.b }[2], [x8], #0x1\n"
- "st1 { v8.b }[2], [x17], #0x1\n"
- "st1 { v6.b }[2], [x16], #0x1\n"
+ "tbz x2, #1, 122f\n"
+ "st1 { v13.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x15], #0x2\n"
+ "st1 { v27.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x15], #0x1\n"
+ "st1 { v27.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x13], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[0], [x7], #0x1\n"
- "st1 { v20.b }[0], [x8], #0x1\n"
- "st1 { v8.b }[0], [x17], #0x1\n"
- "st1 { v6.b }[0], [x16], #0x1\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x15], #0x1\n"
+ "st1 { v27.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x13], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index cf655cbe78..7b0b414517 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,16 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const
-);
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4419048793..89253ba670 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 9e80fbfc07..5d6fbac4bd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 5124b2c8f3..2cc802f9e6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 19767e2823..32117ad1e6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 1ce037b68c..df955206e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
index 6bdcca115c..2c677d2f62 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 1676119bc1..c2bec4cdab 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 394df363da..b7ba363b43 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 976434aa28..ed99f1f642 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index a200ebf2cc..25d83f15c3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 2ee961db15..96cfd5e497 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -151,7 +151,7 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x4, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x2\n"
- "ld1w { z18.s }, p3/Z, [x15]\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
"ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
"addvl x15, x15, #1\n"
@@ -159,13 +159,13 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mul x21, x21, x20\n" // offset *= output_tile_size
"cntw x23\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"addvl x15, x15, #4\n"
"add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"addvl x15, x15, #4\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x23, %x[n_channels]\n"
"add x22, x24, x22, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x15]\n"
@@ -179,71 +179,71 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
"whilelt p1.s, x23, %x[n_channels]\n"
"incw x21\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x14]\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x14]\n"
"incw x23\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
"incw x20\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
"addvl x7, x7, #1\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ld1w { z18.s }, p3/Z, [x15]\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p2/Z, [x8]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
"addvl x8, x8, #1\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
"addvl x17, x17, #1\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
"ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
"addvl x15, x15, #4\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"addvl x15, x15, #4\n"
"cmp x23, %x[n_channels]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
"addvl x14, x14, #1\n"
"ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
"ld1w { z10.s }, p1/Z, [x7]\n"
@@ -259,69 +259,69 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"addvl x15, x15, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
"ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x5, x5, #0x1\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x14]\n"
+ "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
+ "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
"cmp x5, x20\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
"add x20, x4, #0x1\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
"csel x4, x4, x20, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8]\n"
+ "fmla z26.s, p3/M, z4.s, z13.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x8]\n"
"csel x5, x5, XZR, LT\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
"cmp x4, x21\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x24]\n"
- "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "fmla z27.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "fmla z27.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z25.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 079b39c5ec..39f1b3635f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -84,7 +84,7 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
"ldp x13, x12, [x20, #0x0]\n"
"cntw x11\n"
@@ -94,176 +94,176 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x28, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
"addvl x14, x14, #4\n"
"cmp x11, %x[n_channels]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x25, x22, [x15, #0x10]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x24, XZR, x11\n"
- "ldr x23, [x15, #0x20]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x11\n"
+ "ldr x20, [x15, #0x20]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
"whilelt p1.s, x11, %x[n_channels]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x20, [x15, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x27, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x22, [x15, #0x58]\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x15, #0x70]\n"
+ "fmla z31.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "ldr x26, [x15, #0x70]\n"
"addvl x14, x14, #1\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
+ "fmla z28.s, p3/M, z2.s, z25.s\n"
+ "fmla z29.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "incw x24\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldp x25, x22, [x15, #0x10]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
"incw x28\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x27, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
"whilelt p2.s, x28, %x[n_channels]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "ld1w { z10.s }, p1/Z, [x26, x11, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x25, x11, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x22, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
"incw x11\n"
"cmp x11, %x[n_channels]\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "incw x24\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x21, [x15, #0x30]\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "incw x27\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x20, [x15, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x27, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x22, [x15, #0x58]\n"
+ "ldr x21, [x15, #0x58]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index cf74f431df..bd330dc21e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index ce0ae29756..d15a3a8377 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -170,11 +170,11 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x3\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
"mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
"addvl x17, x17, #1\n"
"add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -184,7 +184,7 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"add x24, x26, x22, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
"cmp x25, %x[n_channels]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
@@ -200,275 +200,275 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x25, %x[n_channels]\n"
"incw x21\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
"incw x25\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
"incw x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z8.s, z15.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
"addvl x6, x6, #1\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
"addvl x16, x16, #1\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
"addvl x5, x5, #1\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
"addvl x7, x7, #1\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z6.s, z18.s\n"
"ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
"addvl x14, x14, #1\n"
"cmp x25, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z26.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
"ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
"ld1w { z12.s }, p1/Z, [x14]\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x26]\n"
"ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
- "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
"addvl x26, x26, #1\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z26.s }, p0, [x24]\n"
- "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x21, x2, #0x1\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"cmp x3, x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
"csel x2, x2, x21, LT\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
"csel x3, x3, XZR, LT\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "cmp x2, x20\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
"fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z23.s }, p0, [x26]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z26.s }, p0, [x24]\n"
- "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z27.s\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z27.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "fmla z28.s, p3/M, z8.s, z27.s\n"
+ "fmla z29.s, p3/M, z7.s, z27.s\n"
+ "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z20.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z26.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index fd648a392f..2c868b6cf3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -87,354 +87,354 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "cntw x11\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "ldr x10, [x16, #0x20]\n"
- "mov x9, #0x0\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x20, [x17, #0x20]\n"
+ "mov x15, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x27, XZR, x11\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ld1w { z9.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "incw x27\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x16, #0x38]\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x25, [x17, #0x38]\n"
"mov p1.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x24, [x16, #0x28]\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x16, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x13, [x16, #0x50]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z23.s\n"
+ "ldr x24, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x12, [x16, #0x58]\n"
+ "ldr x23, [x17, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x16, #0x60]\n"
+ "ldr x22, [x17, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x15, [x16, #0x80]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x14, [x16, #0x88]\n"
- "addvl x17, x17, #1\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x10, [x16, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
- "ldr x10, [x16, #0x20]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "incw x9\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
- "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
- "ld1w { z13.s }, p0/Z, [x10, x11, LSL #2]\n"
- "incw x11\n"
- "cmp x11, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z23.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z23.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "fmla z30.s, p3/M, z0.s, z23.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x10, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x9, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ldr x28, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "ldr x27, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z24.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z23.s\n"
+ "fmla z26.s, p3/M, z1.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x22, x21, [x17, #0x0]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ldp x25, x24, [x17, #0x10]\n"
+ "incw x15\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "incw x16\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "incw x27\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x16, #0x38]\n"
- "mov p1.b, p2.b\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "mov p0.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x24, [x16, #0x28]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x16, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x13, [x16, #0x50]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x12, [x16, #0x58]\n"
+ "ldr x24, [x17, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x16, #0x60]\n"
+ "ldr x23, [x17, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x10, [x16, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x10, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x9, [x17, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "ldr x27, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x26, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ldr x24, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ldr x21, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 9184cc00e4..add666e14e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 5380567d36..efd37c38ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -193,18 +193,18 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x4\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
"ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
"mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
"add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"addvl x17, x17, #1\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"add x27, x28, x22, LSL #2\n"
"cntw x26\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
"add x25, x27, x22, LSL #2\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
@@ -224,440 +224,440 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #1\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x26, %x[n_channels]\n"
"incw x21\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
"incw x26\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
"incw x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x11, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x4, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
"ld1w { z9.s }, p2/Z, [x6]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
"ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
"addvl x5, x5, #1\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z20.s, p3/M, z1.s, z11.s\n"
+ "fmla z21.s, p3/M, z0.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
"ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
"addvl x7, x7, #1\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x16]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z2.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
"addvl x16, x16, #1\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z22.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z5.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
"addvl x12, x12, #1\n"
"ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z19.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
"ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
"addvl x6, x6, #1\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
"cmp x26, %x[n_channels]\n"
"addvl x14, x14, #1\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z19.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x28]\n"
- "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
- "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
"addvl x28, x28, #1\n"
- "st1w { z20.s }, p0, [x27]\n"
- "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
- "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
- "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z24.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
"addvl x27, x27, #1\n"
- "st1w { z24.s }, p0, [x25]\n"
- "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
"addvl x25, x25, #1\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x21, x2, #0x1\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"cmp x3, x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x2, x2, x21, LT\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
"csel x3, x3, XZR, LT\n"
"fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
"cmp x2, x20\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x4, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
"fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
"ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x6]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z3.s, z12.s\n"
"fmla z22.s, p3/M, z1.s, z12.s\n"
"fmla z23.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
"fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
"ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
"fmla z22.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z14.s\n"
+ "fmla z21.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z1.s, z14.s\n"
+ "fmla z17.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
"fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z14.s\n"
+ "fmla z23.s, p3/M, z7.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "fmla z19.s, p3/M, z1.s, z14.s\n"
"ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z26.s, p3/M, z1.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z8.s, z14.s\n"
"ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z14.s\n"
+ "fmla z19.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x28]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
- "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
"st1w { z20.s }, p0, [x27]\n"
"st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
"st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
"st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
- "st1w { z24.s }, p0, [x25]\n"
- "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x23]\n"
+ "st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index d904f68806..2e2a45bab0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -98,552 +98,552 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "cntw x11\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "mov x10, #0x0\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mov x15, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x28, XZR, x11\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ld1w { z9.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "incw x28\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
"mov p1.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x15, [x16, #0x40]\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x23, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x14, [x16, #0x48]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x13, [x16, #0x50]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
+ "ldr x26, [x17, #0x60]\n"
"fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
"fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x16, #0x58]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "ldr x23, [x9, #0x0]\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x22, [x9, #0x8]\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x9, #0x10]\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
"fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0xc8]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla z17.s, p3/M, z5.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
"fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x15, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x12, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "fmla z17.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z19.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ldr x21, [x17, #0x118]\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z8.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x20, x25, [x17, #0x0]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
"fmla z17.s, p3/M, z3.s, z10.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
"fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x20]\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x20, x24, [x17, #0x10]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "incw x15\n"
+ "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
"fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x28]\n"
+ "st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
"fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x38]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x40]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x48]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- "incw x10\n"
- "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x50]\n"
- "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
- "whilelt p2.s, x10, %x[n_channels]\n"
- "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x58]\n"
- "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x60]\n"
- "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
- "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x68]\n"
- "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
- "incw x11\n"
- "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x70]\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x78]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z22.s, p3/M, z5.s, z0.s\n"
+ "fmla z23.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "incw x16\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "incw x28\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "mov p1.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x14, [x16, #0x48]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x13, [x16, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x16, #0x58]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "ldr x23, [x9, #0x0]\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x22, [x9, #0x8]\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x9, #0x10]\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0xd8]\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z23.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
"fmla z28.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
"fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
"fmla z24.s, p3/M, z4.s, z11.s\n"
"fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x15, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x12, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z20.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z16.s, p3/M, z0.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x20]\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x28]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x38]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x40]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x48]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x50]\n"
- "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x58]\n"
- "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x60]\n"
- "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x68]\n"
- "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x70]\n"
- "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x78]\n"
- "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
- "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z18.s, p3/M, z5.s, z0.s\n"
+ "fmla z19.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z19.s }, p0, [x20, x13, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index a4ca907e1b..dcffffeb21 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index f7f67855c1..066b935486 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -170,7 +170,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x2\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
"ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
"addvl x17, x17, #1\n"
@@ -178,13 +178,13 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mul x21, x21, x20\n" // offset *= output_tile_size
"cntw x23\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"addvl x17, x17, #4\n"
"add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"addvl x17, x17, #4\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x23, %x[n_channels]\n"
"add x22, x24, x22, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
@@ -201,73 +201,73 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
"whilelt p1.s, x23, %x[n_channels]\n"
"incw x21\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
"incw x23\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
"mov p0.b, p2.b\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
"addvl x5, x5, #1\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z27.s\n"
+ "ld1w { z25.s }, p2/Z, [x16]\n"
"addvl x6, x6, #1\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x7]\n"
"incw x20\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
"addvl x7, x7, #1\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z22.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
"addvl x16, x16, #1\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
"cmp x23, %x[n_channels]\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
"addvl x14, x14, #1\n"
@@ -291,71 +291,71 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"addvl x17, x17, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
"cmp x3, x20\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
"add x20, x2, #0x1\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
"csel x2, x2, x20, LT\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
"csel x3, x3, XZR, LT\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
"cmp x2, x21\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
"st1w { z28.s }, p0, [x24]\n"
"st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
"st1w { z30.s }, p0, [x22]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index e2ff9a214e..dc7a40ff54 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -93,7 +93,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z19.s }, p3/Z, [x15]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"ldp x14, x13, [x20, #0x0]\n"
"cntw x12\n"
@@ -103,119 +103,119 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- "ldp x28, x27, [x16, #0x0]\n"
+ "ldp x28, x26, [x16, #0x0]\n"
"addvl x15, x15, #4\n"
"cmp x12, %x[n_channels]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x24, XZR, x12\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x12\n"
"ldp x23, x22, [x16, #0x20]\n"
"ld1w { z8.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"ldp x21, x20, [x16, #0x30]\n"
"ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
"ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
"ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
"ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
"ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x28, [x16, #0x40]\n"
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
"whilelt p1.s, x12, %x[n_channels]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x16, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldr x26, [x16, #0x50]\n"
+ "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x16, #0x58]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x16, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x28, [x16, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
"ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x27, [x16, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x21, [x16, #0x70]\n"
- "ldr x25, [x16, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z4.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x21, [x16, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x28, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldp x28, x27, [x16, #0x0]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "ld1w { z19.s }, p3/Z, [x15]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x20, x26, [x16, #0x0]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"incw x9\n"
"ldp x23, x22, [x16, #0x20]\n"
- "ld1w { z9.s }, p1/Z, [x28, x12, LSL #2]\n"
- "incw x24\n"
+ "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x27\n"
"mov p0.b, p2.b\n"
"ldp x21, x20, [x16, #0x30]\n"
- "ld1w { z10.s }, p1/Z, [x27, x12, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
"whilelt p2.s, x9, %x[n_channels]\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "ld1w { z11.s }, p1/Z, [x26, x12, LSL #2]\n"
- "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x25, x12, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
"ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
"ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
"ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
"incw x12\n"
@@ -228,83 +228,83 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"addvl x15, x15, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x28, [x16, #0x40]\n"
- "incw x24\n"
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "incw x27\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x16, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldr x26, [x16, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x16, #0x58]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x16, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x28, [x16, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x27, [x16, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x21, [x16, #0x70]\n"
- "ldr x25, [x16, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x21, [x16, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x28, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
"mov p0.b, p2.b\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
index d29d0b5496..061b0a1e2e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
index 4d02d29e4e..a385893146 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -69,69 +69,69 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z5.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z16.s, #0x0\n"
+ "fmov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
"orr x24, x20, %x[ld_in_col], LSL #18\n"
- "mov z17.d, z16.d\n"
+ "mov z21.d, z20.d\n"
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
- "orr x24, x17, x24, LSL #20\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
"mov x22, #0x6\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z3.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
- "add x21, x7, x6\n"
- ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z18.d, z16.d\n"
- "mov z19.d, z16.d\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
"ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
"lsl x24, x24, #0x2\n"
"sub x22, x22, x21\n"
"ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
"mov x10, #0x2\n"
- "ldp x9, x28, [x11], #0x10\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"ldp x27, x26, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x25, x24, [x11], #0x10\n"
+ "ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -140,8 +140,8 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z12.s }, p1, [x9]\n"
@@ -154,298 +154,298 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x10, 8f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- ".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
+ ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
"8:" // Unpadded: 0 priming loads
- "cbz x15, 16f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "sub x15, x15, #0x1\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x15, x13\n"
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x1\n"
"ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x15, x13, LT\n"
+ "sub x11, x11, #0x1\n"
"ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "cmp x14, x11\n"
"ld1w { z28.s }, p1/Z, [x20]\n"
- "sub x13, x13, x21\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
"st1w { z12.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
"st1w { z13.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z14.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x10, 13f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
"13:" // Padded: 0 priming loads
- "cbz x15, 16f\n"
+ "cbz x14, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "sub x15, x15, #0x1\n"
- "sub x13, x13, #0x1\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "cmp x15, x13\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "cmp x14, x11\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "csel x21, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z18.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x8, x8, #0x1\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x25, x25, x23, LSL #2\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
"add x8, x8, #0x1\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z18.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"16:" // Main loop skip tail
- "cbz x13, 18f\n"
+ "cbz x11, 18f\n"
"17:" // Right padding loop
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
+ "st1w { z8.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "st1w { z9.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z10.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z11.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
index 18a572954a..711f7f479a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
index 9f6b09ef88..26315101b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -69,69 +69,69 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z19.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z24.s, #0x0\n"
+ "fmov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z24.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
"orr x24, x20, %x[ld_in_col], LSL #18\n"
- "mov z25.d, z24.d\n"
+ "mov z13.d, z12.d\n"
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa0404ae2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x23]\n"
- "orr x24, x17, x24, LSL #20\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
"mov x22, #0x9\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z7.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
- "add x21, x7, x6\n"
- ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "mov z14.d, z12.d\n"
+ "mov z15.d, z12.d\n"
+ "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ae1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x23]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
"lsl x24, x24, #0x2\n"
"sub x22, x22, x21\n"
- "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x7, x14\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x2\n"
- "ldp x10, x9, [x11], #0x10\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x28, x27, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x11], #0x10\n"
+ "ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -142,9 +142,9 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
"lsr x21, x21, #0x1\n"
- "sub x13, x13, x21\n"
+ "sub x11, x11, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z20.s }, p1, [x10]\n"
@@ -157,490 +157,490 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x22, 8f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
"8:" // Unpadded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "sub x15, x15, #0x2\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "lsr x20, x15, #0x1\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "cmp x20, x13\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "cmp x20, x11\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "csel x22, x20, x13, LT\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "and x15, x15, #0x1\n"
- "ld1w { z0.s }, p1/Z, [x21]\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x22\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
"subs x22, x22, #0x1\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
"st1w { z20.s }, p1, [x10]\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z21.s }, p1, [x9]\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
"add x9, x9, x27, LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x21]\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
"st1w { z22.s }, p1, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
"add x26, x26, x24, LSL #2\n"
"st1w { z23.s }, p1, [x25]\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x22, 13f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
"13:" // Padded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x15, x15, #0x2\n"
- "ld1w { z31.s }, p0/Z, [x21]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
- "sub x13, x13, #0x1\n"
- "lsr x20, x15, #0x1\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "cmp x20, x13\n"
+ "cmp x20, x11\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "csel x22, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x22\n"
"cbz x22, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "st1w { z28.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "st1w { z30.s }, p1, [x26]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z31.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
+ ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
"mov x12, #0x0\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
"add x25, x25, x23, LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "st1w { z28.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z31.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
"16:" // Main loop skip tail
- "cbz x15, 17f\n" // Skip remainder inputs
+ "cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x10]\n"
+ ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z23.s }, p1, [x25]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x13, 19f\n"
+ "cbz x11, 19f\n"
"18:" // Right padding loop
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z21.s }, p1, [x9]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "st1w { z2.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z3.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 18b\n"
"19:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
index 0fa0300f9f..71487e08b6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
index bf12b42ddc..3741b973b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -69,71 +69,71 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x5\n"
- "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x7\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x6\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x16, #0x1\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
"mov z29.d, z28.d\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "orr x23, x7, x23, LSL #20\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "orr x23, x17, x23, LSL #20\n"
"mov x22, #0x8\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x21, x6, x5\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "add x21, x7, x6\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x6, x14\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "madd x20, x20, x7, x13\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x6, x20, x14\n"
+ "msub x13, x7, x20, x13\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x10, #0x4\n"
- "ldp x9, x28, [x11], #0x10\n"
+ "ldp x9, x28, [x22], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x27, x26, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x25, x24, [x11], #0x10\n"
+ "ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -141,308 +141,308 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z25.s }, p1, [x28]\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x6, x5\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
"cbz x10, 10f\n"
"cmp x10, #0x1\n"
- "sub x16, x16, x10\n"
+ "sub x15, x15, x10\n"
"beq 9f\n"
"cmp x10, #0x2\n"
"beq 8f\n"
"cmp x10, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "7:" // Unpadded: 3 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
- ".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
+ ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
+ ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"9:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
+ ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cbz x15, 20f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "sub x15, x15, #0x1\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ "sub x11, x11, #0x1\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
+ "cmp x15, x11\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
- ".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "10:" // Unpadded: 0 priming loads
- "cbz x16, 20f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "sub x16, x16, #0x1\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x16, x13\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x16, x13, LT\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x21\n"
+ ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- "cbz x21, 19f\n"
- "11:" // Unpadded: Main loop
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
"cbz x10, 17f\n"
"cmp x10, #0x1\n"
- "sub x16, x16, x10\n"
+ "sub x15, x15, x10\n"
"beq 16f\n"
"cmp x10, #0x2\n"
"beq 15f\n"
@@ -451,429 +451,429 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
+ "ld1w { z6.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
+ ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
+ ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
- ".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"17:" // Padded: 0 priming loads
- "cbz x16, 20f\n"
+ "cbz x15, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x16, x16, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "sub x13, x13, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "cmp x16, x13\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "cmp x15, x11\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "csel x21, x16, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
+ ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"20:" // Main loop skip tail
- "cbz x13, 22f\n"
+ "cbz x11, 22f\n"
"21:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ "st1w { z0.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x28]\n"
+ "st1w { z1.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z2.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z3.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 21b\n"
"22:" // End
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x15, ALL, MUL #16\n"
- "incb x15, ALL, MUL #9\n"
- "str x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
index cae4b24e66..7412c7b57c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
index 755265835d..81ad8e5833 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -76,11 +76,11 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z0.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x6\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
@@ -99,64 +99,64 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"orr x23, x7, x23, LSL #20\n"
"mov x22, #0xb\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"mov x8, #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"sub x22, x22, x21\n"
"madd x20, x20, x6, x14\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
"msub x14, x6, x20, x14\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x4\n"
- "ldp x10, x9, [x11], #0x10\n"
+ "ldp x11, x10, [x23], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x28, x27, [x20], #0x10\n"
+ "ldp x9, x28, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x26, x25, [x11], #0x10\n"
- "ldp x24, x23, [x20], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
+ ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
"lsr x21, x21, #0x1\n"
"sub x13, x13, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ "st1w { z17.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z19.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
"adds XZR, x6, x5\n"
@@ -171,331 +171,331 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
"ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
"7:" // Unpadded: 3 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"8:" // Unpadded: 2 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z19.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"9:" // Unpadded: 1 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z7.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
+ ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
+ ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"10:" // Unpadded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"sub x16, x16, #0x2\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x13, x13, #0x1\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"lsr x20, x16, #0x1\n"
- "ld1w { z22.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x13\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "csel x22, x20, x13, LT\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"and x16, x16, #0x1\n"
- "ld1w { z24.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x22\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- "cbz x22, 19f\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "cbz x23, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ld1w { z15.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ "st1w { z9.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z26.s }, p1, [x26]\n"
+ ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
+ ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -510,654 +510,654 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
+ ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z19.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"17:" // Padded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x21]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"sub x16, x16, #0x2\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"sub x13, x13, #0x1\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"lsr x20, x16, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"cmp x20, x13\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "csel x22, x20, x13, LT\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
"and x16, x16, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 19f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z24.s }, p1, [x11]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z25.s }, p1, [x9]\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "ld1w { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x26, x26, x24, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z8.s }, p1, [x11]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z25.s }, p1, [x9]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
+ ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x26, x26, x24, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"20:" // Main loop skip tail
"cbz x16, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
"sub x13, x13, #0x1\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
+ "st1w { z25.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
"add x8, x8, #0x1\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"21:" // Tail input: End
"cbz x13, 23f\n"
"22:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z11.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
"bgt 22b\n"
"23:" // End
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x15, ALL, MUL #16\n"
- "incb x15, ALL, MUL #9\n"
- "str x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x17\n"
"whilelt p1.s, x17, x7\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
index f09c61667f..50ef6c3815 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
index 5570b27644..be82e04613 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -69,102 +69,102 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z22.s, #0x0\n"
+ "fmov z26.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z22.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "fmov z9.s, #0x0\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "incb x20\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- ".inst 0x648aab29 // bfcvtnt z9.h, p2/M, z25.s\n"
- "incb x21, ALL, MUL #3\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aab28 // bfcvt z8.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
- "fmov z2.s, #0x0\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aab21 // bfcvt z1.h, p2/M, z25.s\n"
- ".inst 0x648aab68 // bfcvtnt z8.h, p2/M, z27.s\n"
- "incb x20\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x648aaaa6 // bfcvtnt z6.h, p2/M, z21.s\n"
- ".inst 0x658aaaa5 // bfcvt z5.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aab22 // bfcvtnt z2.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "fmov z6.s, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z30.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
+ "fmov z11.s, #0x0\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
+ ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
+ ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z20.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
- ".inst 0x648aab61 // bfcvtnt z1.h, p2/M, z27.s\n"
- ".inst 0x658aab6c // bfcvt z12.h, p2/M, z27.s\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
+ ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
"incb x21, ALL, MUL #3\n"
- "fmov z7.s, #0x0\n"
- ".inst 0x658aab24 // bfcvt z4.h, p2/M, z25.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
- ".inst 0x648aaaac // bfcvtnt z12.h, p2/M, z21.s\n"
- "sub x20, x15, #0x1\n"
+ "fmov z14.s, #0x0\n"
+ ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
+ ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
+ "sub x20, x14, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- ".inst 0x658aaaaa // bfcvt z10.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "orr x23, x17, x23, LSL #20\n"
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ "ld1w { z29.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
"mov x22, #0x6\n"
- "add x21, x7, x6\n"
+ "add x21, x17, x7\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z23.d, z22.d\n"
- ".inst 0x648aab27 // bfcvtnt z7.h, p2/M, z25.s\n"
- ".inst 0x648aab64 // bfcvtnt z4.h, p2/M, z27.s\n"
- ".inst 0x648aaaa0 // bfcvtnt z0.h, p2/M, z21.s\n"
+ "mov z27.d, z26.d\n"
+ ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
+ ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
+ ".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0x658aaaa3 // bfcvt z3.h, p2/M, z21.s\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040ac0 // mova za.d[x8, #0], { z22.d-z23.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040ac1 // mova za.d[x8, #1], { z22.d-z23.d }\n"
+ ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
"mov x10, #0x2\n"
- "ldp x9, x28, [x11], #0x10\n"
- ".inst 0xc0040ac2 // mova za.d[x8, #2], { z22.d-z23.d }\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0040ac3 // mova za.d[x8, #3], { z22.d-z23.d }\n"
+ ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "ldp x25, x24, [x11], #0x10\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -172,389 +172,389 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
+ ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z20.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z22.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z25.s }, p1, [x25]\n"
+ "st1w { z21.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z23.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x10, 8f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
+ ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
+ ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
+ ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc12a11d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
+ ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
+ ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
+ ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
+ ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
+ ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
+ ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
"8:" // Unpadded: 0 priming loads
- "cbz x15, 16f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "sub x15, x15, #0x1\n"
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "cmp x15, x13\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
+ "cmp x14, x11\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x15, x13, LT\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x13, x13, x21\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x13]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
"ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x10, 13f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
+ ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
- ".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc12a11d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
+ ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
+ ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
"13:" // Padded: 0 priming loads
- "cbz x15, 16f\n"
+ "cbz x14, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x15, x15, #0x1\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x13, x13, #0x1\n"
- "cmp x15, x13\n"
- "csel x21, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "sub x14, x14, #0x1\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
"ld1w { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- "st1w { z26.s }, p1, [x28]\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"16:" // Main loop skip tail
- "cbz x13, 18f\n"
+ "cbz x11, 18f\n"
"17:" // Right padding loop
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z30.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z29.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
index 89b9199084..e685884762 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
index e8c9bfeb29..a3b9ca402a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -69,89 +69,89 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z27.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z23.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z4.s, #0x0\n"
+ "fmov z24.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z4.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "incb x20\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aaa69 // bfcvt z9.h, p2/M, z19.s\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aab09 // bfcvtnt z9.h, p2/M, z24.s\n"
- "incb x20\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z23.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
+ "incb x21\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
+ ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z9.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
- ".inst 0x658aa983 // bfcvt z3.h, p2/M, z12.s\n"
- ".inst 0x658aaa62 // bfcvt z2.h, p2/M, z19.s\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- ".inst 0x648aab02 // bfcvtnt z2.h, p2/M, z24.s\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa980 // bfcvt z0.h, p2/M, z12.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aaa6a // bfcvt z10.h, p2/M, z19.s\n"
- "sub x20, x15, #0x1\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- "mov z5.d, z4.d\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "orr x23, x17, x23, LSL #20\n"
+ "mov z25.d, z24.d\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
"mov x22, #0x9\n"
- "mov z6.d, z4.d\n"
- "add x21, x7, x6\n"
+ "mov z26.d, z24.d\n"
+ "add x21, x17, x7\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z7.d, z4.d\n"
- ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
- ".inst 0x658aa981 // bfcvt z1.h, p2/M, z12.s\n"
+ "mov z27.d, z24.d\n"
+ ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
"mov x22, #0x2\n"
- "ldp x10, x9, [x11], #0x10\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ldp x28, x27, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x11], #0x10\n"
+ "ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -159,396 +159,396 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
"lsr x21, x21, #0x1\n"
- "sub x13, x13, x21\n"
+ "sub x11, x11, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z29.s }, p1, [x9]\n"
+ "st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x22, 8f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaace // bfcvt z14.h, p2/M, z22.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
+ ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
+ ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
+ ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
"8:" // Unpadded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "sub x15, x15, #0x2\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "lsr x20, x15, #0x1\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "lsr x20, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "cmp x20, x13\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "cmp x20, x11\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "csel x22, x20, x13, LT\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "csel x22, x20, x11, LT\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "and x15, x15, #0x1\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "and x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "sub x13, x13, x22\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "sub x11, x11, x22\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
+ ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
"subs x22, x22, #0x1\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
"add x10, x10, x28, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "st1w { z29.s }, p1, [x9]\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
"add x26, x26, x24, LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
+ ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x22, 13f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
+ ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
"13:" // Padded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "sub x15, x15, #0x2\n"
- "sub x13, x13, #0x1\n"
- "lsr x20, x15, #0x1\n"
- "cmp x20, x13\n"
- "csel x21, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "and x15, x15, #0x1\n"
- "sub x13, x13, x21\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x21, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
"mov x12, #0x0\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
@@ -563,197 +563,197 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"st1w { z30.s }, p1, [x26]\n"
"add x8, x8, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2c // bfcvt z12.h, p2/M, z17.s\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
"ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa6d // bfcvt z13.h, p2/M, z19.s\n"
+ ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"subs x21, x21, #0x1\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z31.s }, p1, [x25]\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0x648aaaab // bfcvtnt z11.h, p2/M, z21.s\n"
- ".inst 0x648aaa8c // bfcvtnt z12.h, p2/M, z20.s\n"
+ ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
+ ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"bgt 14b\n"
"15:" // Main loop tail
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
"st1w { z28.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z29.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"st1w { z30.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
"st1w { z31.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"16:" // Main loop skip tail
- "cbz x15, 17f\n" // Skip remainder inputs
+ "cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1w { z29.s }, p1, [x9]\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z10.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z11.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x13, 19f\n"
+ "cbz x11, 19f\n"
"18:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z29.s }, p1, [x9]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z10.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z11.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 18b\n"
"19:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
index c2d439fe78..5215ccaf39 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
index 2b3a247686..b72042558d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -73,237 +73,237 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
"sub x20, x20, x4\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z26.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z31.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
- "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z24.s, #0x0\n"
- "cbz x20, 2f\n"
- "ld1w { z24.s }, p1/Z, [x20, x7, LSL #2]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z30.s, #0x0\n"
+ "cbz x21, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "incb x20\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "incb x21\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
+ ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"addvl x24, SP, #30\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
+ ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
"addvl x24, x24, #-6\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x20\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "mov x21, x20\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x20\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
+ ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
+ "incb x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
+ ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ "mov x23, x21\n"
+ "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
+ ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
+ "ld1w { z4.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "incb x21\n"
+ "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- "fmov z4.s, #0x0\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ "fmov z2.s, #0x0\n"
+ ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
+ "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
+ ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
+ ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24]\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
+ "fmov z21.s, #0x0\n"
+ "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
+ "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x20\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
+ ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
+ ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
"ld1w { z3.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z2.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
+ "ld1w { z26.s }, p2/Z, [x21]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
"incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "sub x20, x17, #0x1\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
+ "fmov z8.s, #0x0\n"
+ "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "sub x20, x25, #0x1\n"
+ "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "orr x23, x6, x23, LSL #20\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
+ "orr x23, x7, x23, LSL #20\n"
"mov x22, #0x8\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "add x21, x5, x4\n"
+ "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ "add x21, x6, x4\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "mov z25.d, z24.d\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
+ "mov z31.d, z30.d\n"
+ ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
+ "st1h { z8.h }, p2, [x24]\n"
+ ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
+ ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
"mov x11, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
"mov x8, #0x8\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x5, x16\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "madd x20, x20, x6, x16\n"
+ "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x16, x5, x20, x16\n"
- ".inst 0xc0046b00 // mova za.d[x11, #0], { z24.d-z25.d }\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046b01 // mova za.d[x11, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc0046b02 // mova za.d[x11, #2], { z24.d-z25.d }\n"
- "ldp x0, x10, [x20], #0x10\n"
- ".inst 0xc0046b03 // mova za.d[x11, #3], { z24.d-z25.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
+ "ldp x5, x10, [x20], #0x10\n"
+ ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046b04 // mova za.d[x11, #4], { z24.d-z25.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc0046b05 // mova za.d[x11, #5], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046b06 // mova za.d[x11, #6], { z24.d-z25.d }\n"
- ".inst 0xc0046b07 // mova za.d[x11, #7], { z24.d-z25.d }\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
+ ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ "st1w { z4.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z6.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z1.s }, p1, [x9]\n"
+ "st1w { z5.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z3.s }, p1, [x28]\n"
+ "st1w { z7.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x6, x4\n"
"bne 12f\n"
"cbz x22, 10f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 9f\n"
"cmp x22, #0x2\n"
"beq 8f\n"
@@ -311,335 +311,335 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z21.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
+ ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
+ ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
"7:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ "ld1w { z9.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
+ ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
+ ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
"8:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p1/Z, [x16]\n"
+ ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
"addvl x22, SP, #12\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
"addvl x21, SP, #18\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
"addvl x20, SP, #24\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z1.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
+ "ld1w { z19.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
+ ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
+ ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
+ ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
+ ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
+ ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
"addvl x23, SP, #6\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
"addvl x22, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z20.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z10.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
+ "ld1w { z17.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
+ ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
+ "ld1w { z18.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
+ ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
+ ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
+ ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
+ ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
+ ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 20f\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "sub x17, x17, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "sub x25, x25, #0x1\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x15, x15, #0x1\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "cmp x17, x15\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "cmp x25, x15\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x25, x17, x15, LT\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x25, x25, x15, LT\n"
+ ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
"sub x15, x15, x25\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"cbz x25, 19f\n"
"11:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x23, SP, #12\n"
- "ld1w { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
+ "ld1w { z27.s }, p1/Z, [x16]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
"ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0x648aaacc // bfcvtnt z12.h, p2/M, z22.s\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
+ ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
+ ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
+ ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
+ ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
+ "st1w { z8.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z10.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "st1w { z3.s }, p1, [x28]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "st1w { z11.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
"cbz x22, 17f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 16f\n"
"cmp x22, #0x2\n"
"beq 15f\n"
@@ -649,449 +649,449 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
"add x21, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x20, SP, #24\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
+ "ld1w { z9.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
+ ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
+ ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
"add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x21, SP, #18\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
"addvl x20, SP, #24\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
+ ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
"add x23, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x22, SP, #12\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ "ld1w { z26.s }, p0/Z, [x23]\n"
"addvl x21, SP, #18\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
+ ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
+ ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
+ ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
+ ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
"add x24, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x23, SP, #6\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
"addvl x22, SP, #12\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
+ ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
"17:" // Padded: 0 priming loads
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 20f\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x17, x17, #0x1\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "sub x25, x25, #0x1\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"sub x15, x15, x25\n"
"cbz x25, 19f\n"
"18:" // Padded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x23, SP, #12\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "ld1w { z23.s }, p0/Z, [x16]\n"
+ ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
"add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z21.s }, p0/Z, [x22]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z8.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0x648aaacc // bfcvtnt z12.h, p2/M, z22.s\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
"add x8, x8, #0x2\n"
".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
"st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
+ "add x14, x14, x5, LSL #2\n"
"st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
"st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 18b\n"
"19:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "st1w { z20.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z22.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z21.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- "st1w { z3.s }, p1, [x28]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z23.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"20:" // Main loop skip tail
"cbz x15, 22f\n"
@@ -1100,16 +1100,16 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
"st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
+ "add x14, x14, x5, LSL #2\n"
"st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 21b\n"
@@ -1118,12 +1118,12 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20, LSL #2\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1141,7 +1141,7 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x0", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
index c99cf51da4..53e596418b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
index 01f689a0b4..3a56e69d26 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -76,134 +76,134 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z30.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z4.s, #0x0\n"
+ "fmov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "incb x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
"addvl x24, SP, #15\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
"addvl x24, x24, #-3\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z15.h }, p2, [x24]\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
"addvl x24, x24, #-3\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "incb x20\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "incb x21\n"
+ ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
+ "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
+ "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"addvl x24, x24, #-3\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z4.h }, p2, [x24]\n"
+ ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
+ "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
"addvl x24, x24, #-3\n"
- "st1h { z1.h }, p2, [x24]\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
+ "st1h { z24.h }, p2, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
+ ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
+ "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
+ ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
"ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
"sub x20, x7, #0x1\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
+ "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
"addvl x24, x24, #-3\n"
- "mov z5.d, z4.d\n"
+ "mov z17.d, z16.d\n"
"orr x23, x5, x23, LSL #20\n"
"mov x22, #0xb\n"
- "mov z6.d, z4.d\n"
- "mov z7.d, z4.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
"add x21, x4, x3\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
+ ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
+ "st1h { z9.h }, p2, [x24]\n"
+ ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
"mov x8, #0x0\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
"ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
@@ -213,20 +213,20 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040c83 // mova za.d[x8, #3], { z4.d-z7.d }\n"
+ ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -234,21 +234,21 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
"lsr x21, x21, #0x1\n"
"sub x16, x16, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x15]\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- "st1w { z25.s }, p1, [x14]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -264,331 +264,331 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x17]\n"
- ".inst 0x658aaaea // bfcvt z10.h, p2/M, z23.s\n"
+ "ld1w { z0.s }, p1/Z, [x17]\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
"addvl x20, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
+ ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
"7:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
"addvl x20, SP, #9\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
+ ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
"8:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"addvl x21, SP, #6\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
"addvl x20, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
"addvl x21, SP, #3\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
"addvl x20, SP, #9\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z11.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
+ ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
"sub x7, x7, #0x2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x16, x16, #0x1\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
"lsr x20, x7, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x16\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
"csel x26, x20, x16, LT\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
"and x7, x7, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
"sub x16, x16, x26\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"cbz x26, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- "ld1w { z18.s }, p1/Z, [x17]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b21 // ld1h { z1.h, z9.h }, pn10.b/Z, [x25]\n"
+ "ld1w { z14.s }, p1/Z, [x17]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row], LSL #2\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"addvl x21, SP, #9\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"subs x26, x26, #0x1\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ "ld1w { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
+ ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
+ ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "st1w { z24.s }, p1, [x15]\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "st1w { z9.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "st1w { z26.s }, p1, [x10]\n"
+ ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
+ "st1w { z10.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
+ "ld1w { z31.s }, p1/Z, [x17]\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
+ "st1w { z11.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
+ ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
+ ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -603,282 +603,282 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z21.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
"addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
"addvl x20, SP, #12\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z22.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z7.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
"addvl x21, SP, #3\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
"addvl x20, SP, #9\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"17:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
"lsr x20, x7, #0x1\n"
@@ -889,323 +889,323 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"sub x16, x16, x24\n"
"cbz x24, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x23, SP, #6\n"
"addvl x21, SP, #12\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x22, SP, #3\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
"addvl x21, SP, #9\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x17]\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
"mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
+ ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
"subs x24, x24, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"st1w { z24.s }, p1, [x15]\n"
"mov x12, #0x8\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z25.s }, p1, [x14]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
"add x15, x15, x13, LSL #2\n"
"add x14, x14, x11, LSL #2\n"
"st1w { z26.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z27.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- ".inst 0x648aaaaa // bfcvtnt z10.h, p2/M, z21.s\n"
- ".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
+ ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
+ ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
+ ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z5.s }, p0/Z, [x17]\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #3\n"
"addvl x20, SP, #9\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "st1w { z24.s }, p1, [x15]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x9]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"20:" // Main loop skip tail
"cbz x7, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
+ ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
"addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x20, SP, #12\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x15]\n"
+ ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
+ ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
"add x8, x8, #0x1\n"
- "st1w { z25.s }, p1, [x14]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x9]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"21:" // Tail input: End
"cbz x16, 23f\n"
"22:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x15]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 22b\n"
"23:" // End
@@ -1213,12 +1213,12 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20, LSL #2\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
index be4f02fc30..de3eadac8a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 6c42c76683..845f376926 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "ld1sb { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1sb { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1sb { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1sb { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1sb { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1sb { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1sb { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
index d14d662240..56fb127aa0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 03575aa799..1d0efc6bc1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1sb { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1sb { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
index 6f3290fd3c..40fa718266 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index d366b3c8d5..bb68733a45 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1sb { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1sb { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1sb { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1sb { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1sb { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1sb { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
index e7a781d072..8bffc05e1f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 3e8510392f..3da0d14d74 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1sb { z21.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1sb { z21.s }, p1/Z, [x22]\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1sb { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1sb { z19.s }, p1/Z, [x23]\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1sb { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1sb { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1sb { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1sb { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
index 875a9f8294..2e40c75d6b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index a7ef556840..60c3a1e632 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1b { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
index b878914ce8..f852e12de1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 630d870433..e4ce6c74fb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1b { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1b { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
index db0750eb08..d8b87dcd55 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 2c19e232f8..d33ef764ef 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
index 9fa295b20e..05aad19c09 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 468e6778a4..6c144afa77 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1b { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1b { z19.s }, p1/Z, [x23]\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
index de574fff9a..a4345097b5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 1636225b31..612beb342a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1b { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
index e412216af3..104c11fc9d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 2848a015db..8ce04fb8c2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1b { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
index 6071197340..52173b8551 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 3e77c75ad7..64023eeaff 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
index 6949e69e39..ad82070912 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 33bb4eb8ec..d8dc69127e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1b { z19.s }, p1/Z, [x23]\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1c1fb25e1f..edee21e941 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 9fd220abf8..d807856ccb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -108,10 +108,10 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"whilelt p2.h, XZR, %x[n_channels]\n"
"madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1h { z18.h }, p3/Z, [x10]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
"add x27, x13, x13\n"
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
@@ -125,10 +125,10 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x11, %x[n_channels]\n"
"add x23, x25, x23, LSL #1\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
"add x22, x28, x22, LSL #1\n"
"mov x21, #0x0\n"
@@ -142,175 +142,175 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
"whilelt p1.h, x11, %x[n_channels]\n"
"inch x21\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
"inch x11\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
"mov p0.b, p2.b\n"
- "ld1h { z18.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
"addvl x9, x9, #1\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
"addvl x26, x26, #1\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
"ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
"ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
"ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
"cmp x11, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
"ld1h { z10.h }, p1/Z, [x9]\n"
"ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
"ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
- "st1h { z28.h }, p0, [x28]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
"addvl x23, x23, #1\n"
- "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z22.h }, p0, [x22]\n"
"addvl x28, x28, #1\n"
"ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
"addvl x10, x10, #-6\n"
- "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
"ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
"add x21, x10, #0x1\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
"csel x10, x10, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
"csel x14, x14, XZR, LT\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
"cmp x10, x20\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p0, [x28]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 9242b470c3..90982b6990 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -87,7 +87,7 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z18.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
@@ -98,99 +98,99 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
"addvl x16, x16, #-6\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"whilelt p1.h, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
"inch x28\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
"mov p0.b, p2.b\n"
- "ld1h { z18.h }, p3/Z, [x16]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
"inch x9\n"
- "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
"inch x14\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
@@ -199,98 +199,98 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"cmp x14, %x[n_channels]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
"ldr x23, [x15, #0x60]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"inch x28\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index d49b14eeaf..da2ef72a30 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index d2dae84089..a22ab39d6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x10, x23, LSL #1\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z18.h }, p3/Z, [x13]\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
@@ -129,10 +129,10 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
"add x24, x11, x21, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x15, %x[n_channels]\n"
"add x23, x24, x21, LSL #1\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
"add x22, x16, x16\n"
"mov x21, #0x0\n"
@@ -146,131 +146,131 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
"whilelt p1.h, x15, %x[n_channels]\n"
"inch x21\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
"inch x15\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
"inch x20\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z18.h }, p3/Z, [x13]\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
"addvl x10, x10, #1\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
"addvl x28, x28, #1\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
"addvl x14, x14, #1\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
"ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x14]\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
"ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
"addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
"ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
"addvl x26, x26, #1\n"
"ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
@@ -279,182 +279,182 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"cmp x15, %x[n_channels]\n"
"ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
"ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
"ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x26]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
"ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
- "st1h { z23.h }, p0, [x11]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
"addvl x11, x11, #1\n"
"ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
"st1h { z26.h }, p0, [x24]\n"
"addvl x13, x13, #-6\n"
- "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
"addvl x24, x24, #1\n"
- "st1h { z29.h }, p0, [x23]\n"
- "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x8, x8, #0x1\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
"cmp x8, x20\n"
"add x21, x13, #0x1\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
"csel x13, x13, x21, LT\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
"mov p0.b, p2.b\n"
"csel x8, x8, XZR, LT\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
"cmp x13, x20\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p0, [x11]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
"st1h { z26.h }, p0, [x24]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
- "st1h { z29.h }, p0, [x23]\n"
- "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 59c0e0cf0b..4f8368acd5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -88,390 +88,390 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z18.h }, p3/Z, [x17]\n"
- "cnth x15\n"
- "mov x14, #0x0\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "cnth x16\n"
+ "mov x15, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [x16, #0x20]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
- "addvl x17, x17, #-6\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "inch x13\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
"mov p1.b, p2.b\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x23, [x27, #0x0]\n"
- "whilelt p0.h, x15, %x[n_channels]\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.h, x16, %x[n_channels]\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x22, [x27, #0x8]\n"
- "ldr x21, [x27, #0x10]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ldr x20, [x27, #0x18]\n"
- "ld1h { z18.h }, p3/Z, [x17]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x28, [x16, #0x20]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
"fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ldr x23, [x27, #0x20]\n"
- "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "inch x14\n"
- "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
- "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "ld1h { z13.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
"inch x15\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "whilelt p2.h, x14, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
+ "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
+ "inch x16\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "inch x13\n"
- "mov p1.b, p2.b\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
+ "mov p0.b, p2.b\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x23, [x27, #0x0]\n"
- "ldr x22, [x27, #0x8]\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x21, [x27, #0x10]\n"
- "ldr x20, [x27, #0x18]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
"fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ldr x23, [x27, #0x20]\n"
- "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
- "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
- "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
+ "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index ac6ae284fd..af5ee740c9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index c0b9137f6b..41eaa4f18c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
"add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
"add x13, x8, x23, LSL #1\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"add x12, x13, x23, LSL #1\n"
"add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
@@ -132,8 +132,8 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x5\n"
"add x26, x9, x22, LSL #1\n"
"add x25, x6, x6\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x16, %x[n_channels]\n"
"add x24, x28, x23, LSL #1\n"
"ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
@@ -149,500 +149,500 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
"whilelt p1.h, x16, %x[n_channels]\n"
"inch x21\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
+ "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
"inch x16\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
"inch x20\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
"ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
"ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z21.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
"ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
"ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
"ld1h { z11.h }, p2/Z, [x28]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
"ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z14.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z13.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z13.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
"fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z12.h\n"
"fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z13.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z9.h\n"
+ "fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x11]\n"
+ "fmla z25.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
"fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
- "addvl x8, x8, #1\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
"fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
- "addvl x12, x12, #1\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
"addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
"fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
"fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
"fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
"addvl x24, x24, #1\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
"addvl x13, x13, #1\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z0.h\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z0.h\n"
+ "fmla z24.h, p3/M, z4.h, z0.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
"ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
"ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
"ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
"ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
"ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
"addvl x17, x17, #16\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
"ld1h { z10.h }, p1/Z, [x8]\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
"ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "st1h { z16.h }, p0, [x15]\n"
+ "fmin z21.h, p3/M, z21.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
"ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z22.h, p3/M, z22.h, z16.h\n"
+ "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
"addvl x28, x28, #1\n"
- "st1h { z20.h }, p0, [x9]\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
"addvl x15, x15, #1\n"
- "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
+ "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
"addvl x17, x17, #-6\n"
- "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
- "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
+ "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
"addvl x9, x9, #1\n"
- "st1h { z24.h }, p0, [x26]\n"
- "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
"addvl x26, x26, #1\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x4, x4, #0x1\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
"cmp x4, x20\n"
"add x21, x16, #0x1\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x16, x16, x21, LT\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
"ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
"mov p0.b, p2.b\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24]\n"
- "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
"csel x4, x4, XZR, LT\n"
"cmp x16, x20\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
"fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "st1h { z16.h }, p0, [x15]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "st1h { z20.h }, p0, [x9]\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z24.h }, p0, [x26]\n"
- "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z10.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z22.h\n"
+ "fmla z18.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "fmla z30.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z13.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z25.h, p3/M, z3.h, z29.h\n"
+ "fmla z12.h, p3/M, z2.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z24.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z3.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z22.h\n"
+ "fmla z11.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z21.h\n"
+ "fmla z14.h, p3/M, z2.h, z29.h\n"
+ "fmla z31.h, p3/M, z5.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z29.h\n"
+ "fmla z27.h, p3/M, z3.h, z29.h\n"
+ "fmla z30.h, p3/M, z1.h, z29.h\n"
+ "fmla z11.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z19.h\n"
+ "fmla z12.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z22.h\n"
+ "fmla z14.h, p3/M, z3.h, z22.h\n"
+ "fmla z26.h, p3/M, z1.h, z22.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z19.h\n"
+ "fmla z14.h, p3/M, z6.h, z19.h\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "fmla z13.h, p3/M, z3.h, z19.h\n"
+ "fmla z10.h, p3/M, z1.h, z19.h\n"
+ "fmla z12.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z29.h\n"
+ "fmla z18.h, p3/M, z1.h, z29.h\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x12]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z0.h, z22.h\n"
+ "fmla z17.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11]\n"
+ "fmla z12.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z29.h\n"
+ "fmla z26.h, p3/M, z3.h, z29.h\n"
+ "fmla z10.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z9.h\n"
+ "fmla z12.h, p3/M, z7.h, z22.h\n"
+ "fmla z23.h, p3/M, z6.h, z22.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z13.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "fmla z12.h, p3/M, z5.h, z21.h\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z11.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z25.h, p3/M, z6.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z18.h, p3/M, z3.h, z22.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z17.h, p3/M, z1.h, z22.h\n"
+ "fmla z14.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z4.h, z29.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z30.h, p3/M, z2.h, z29.h\n"
+ "fmla z11.h, p3/M, z1.h, z29.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmax z11.h, p3/M, z11.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmla z10.h, p3/M, z4.h, z21.h\n"
+ "fmla z12.h, p3/M, z3.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmax z10.h, p3/M, z10.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmax z12.h, p3/M, z12.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z11.h, p3/M, z11.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z17.h }, p0, [x9]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z12.h, p3/M, z12.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z26.h }, p0, [x26]\n"
+ "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z10.h }, p0, [x23]\n"
+ "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
@@ -653,4 +653,4 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 972b78b6d5..c0be293cd7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -99,616 +99,616 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
- "cnth x15\n"
- "mov x14, #0x0\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "cnth x17\n"
+ "mov x16, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "inch x13\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
+ "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
"mov p1.b, p2.b\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "whilelt p0.h, x15, %x[n_channels]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.h, p3/M, z7.h, z25.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z25.h\n"
+ "fmla z31.h, p3/M, z3.h, z25.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
+ "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
+ "whilelt p0.h, x17, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "fmla z14.h, p3/M, z8.h, z25.h\n"
+ "fmla z23.h, p3/M, z5.h, z25.h\n"
+ "fmla z15.h, p3/M, z2.h, z25.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z9.h, p3/M, z1.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z13.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
"fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ldr x9, [x16, #0x118]\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.h, p3/M, z3.h, z25.h\n"
+ "fmla z14.h, p3/M, z0.h, z25.h\n"
+ "fmla z23.h, p3/M, z6.h, z29.h\n"
+ "fmla z15.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
"fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
- "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z5.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "fmla z9.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.h, p3/M, z7.h, z25.h\n"
+ "fmla z18.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.h, p3/M, z7.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z30.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.h, p3/M, z8.h, z10.h\n"
+ "fmla z21.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z5.h, z10.h\n"
+ "fmla z11.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z27.h, p3/M, z0.h, z29.h\n"
+ "fmla z14.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.h, p3/M, z6.h, z25.h\n"
+ "fmla z23.h, p3/M, z4.h, z25.h\n"
+ "fmla z30.h, p3/M, z3.h, z25.h\n"
+ "fmla z15.h, p3/M, z1.h, z25.h\n"
+ "fmla z18.h, p3/M, z0.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.h, p3/M, z2.h, z25.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z0.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z5.h, z25.h\n"
+ "fmla z28.h, p3/M, z1.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.h, p3/M, z0.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z3.h, z25.h\n"
+ "fmla z9.h, p3/M, z8.h, z12.h\n"
+ "fmla z11.h, p3/M, z5.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z6.h, z25.h\n"
+ "fmla z15.h, p3/M, z5.h, z25.h\n"
+ "fmla z13.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z29.h\n"
+ "fmla z21.h, p3/M, z6.h, z29.h\n"
+ "fmla z23.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z25.h\n"
+ "fmla z31.h, p3/M, z7.h, z25.h\n"
+ "fmla z13.h, p3/M, z6.h, z25.h\n"
+ "fmla z18.h, p3/M, z5.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z25.h\n"
+ "fmla z28.h, p3/M, z3.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z27.h, p3/M, z5.h, z25.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z25.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z21.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z20.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z25.h\n"
+ "fmla z11.h, p3/M, z1.h, z25.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z12.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
+ "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.h, p3/M, z4.h, z10.h\n"
+ "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla z28.h, p3/M, z4.h, z12.h\n"
- "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x58]\n"
- "inch x14\n"
- "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
- "inch x15\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x70]\n"
- "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x78]\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
- "whilelt p2.h, x14, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.h, p3/M, z23.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "inch x16\n"
+ "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z19.h\n"
+ "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.h, p3/M, z21.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
+ "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "inch x13\n"
- "mov p1.b, p2.b\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
"fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.h, p3/M, z7.h, z23.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
+ "fmla z15.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z23.h\n"
+ "fmla z20.h, p3/M, z3.h, z23.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
+ "fmla z27.h, p3/M, z8.h, z23.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "fmla z28.h, p3/M, z2.h, z23.h\n"
"fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ldr x9, [x16, #0x118]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
- "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x58]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x70]\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x78]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
- "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
- "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "fmla z9.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z30.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z10.h, p3/M, z3.h, z29.h\n"
+ "fmla z25.h, p3/M, z2.h, z29.h\n"
+ "fmla z24.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.h, p3/M, z3.h, z23.h\n"
+ "fmla z27.h, p3/M, z0.h, z23.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.h, p3/M, z4.h, z29.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.h, p3/M, z8.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.h, p3/M, z5.h, z17.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "fmla z15.h, p3/M, z1.h, z17.h\n"
+ "fmla z11.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z4.h, z21.h\n"
+ "fmla z14.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z1.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z23.h\n"
+ "fmla z26.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z29.h\n"
+ "fmla z15.h, p3/M, z5.h, z29.h\n"
+ "fmla z11.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z22.h, p3/M, z0.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.h, p3/M, z6.h, z23.h\n"
+ "fmla z31.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z23.h\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z13.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "fmla z15.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z9.h, p3/M, z8.h, z23.h\n"
+ "fmla z11.h, p3/M, z5.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z5.h, z21.h\n"
+ "fmla z10.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z31.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z10.h, p3/M, z6.h, z21.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "fmla z24.h, p3/M, z4.h, z21.h\n"
+ "fmla z26.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z11.h, p3/M, z8.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z29.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z29.h\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z23.h\n"
+ "fmla z30.h, p3/M, z6.h, z23.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmla z20.h, p3/M, z8.h, z29.h\n"
+ "fmla z10.h, p3/M, z7.h, z29.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
+ "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z23.h\n"
+ "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.h, p3/M, z3.h, z23.h\n"
+ "fmla z24.h, p3/M, z5.h, z29.h\n"
+ "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.h, p3/M, z10.h, z19.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.h, p3/M, z25.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z25.h, p3/M, z25.h, z19.h\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
+ "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 82173ee71f..d8a25666bd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 6a9b354c02..58decdba1c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -112,7 +112,7 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x28, x12, x23, LSL #1\n"
"madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z19.h }, p3/Z, [x11]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
"ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
@@ -128,8 +128,8 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x24, x26, x15\n"
"add x9, x9, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"cmp x13, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x23, x25, x23, LSL #1\n"
"add x22, x9, x21, LSL #1\n"
"ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
@@ -147,191 +147,191 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
"whilelt p1.h, x13, %x[n_channels]\n"
"inch x21\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
"inch x13\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ld1h { z14.h }, p2/Z, [x25]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z15.h }, p2/Z, [x27]\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
"addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
- "ld1h { z19.h }, p3/Z, [x11]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
"ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
"addvl x27, x27, #1\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
"ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
"ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
"ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
"ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
"ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
"cmp x13, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
"ld1h { z10.h }, p1/Z, [x12]\n"
"ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
"ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
"ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
"addvl x25, x25, #1\n"
"ld1h { z14.h }, p1/Z, [x28]\n"
"ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
"addvl x23, x23, #1\n"
"ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
- "st1h { z28.h }, p0, [x9]\n"
+ "st1h { z27.h }, p0, [x9]\n"
"ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
"addvl x9, x9, #1\n"
"ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
"addvl x11, x11, #-6\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ld1h { z14.h }, p2/Z, [x25]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
"add x16, x16, #0x1\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z15.h }, p2/Z, [x27]\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
"cmp x16, x20\n"
"add x21, x11, #0x1\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
"csel x11, x11, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
"csel x16, x16, XZR, LT\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
"ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
"cmp x11, x20\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z28.h }, p0, [x9]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index ff97b51e28..d5fbb6baee 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -96,7 +96,7 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z19.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
@@ -111,8 +111,8 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
@@ -126,89 +126,89 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
"ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
"whilelt p1.h, x14, %x[n_channels]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"inch x9\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
"ldp x21, x20, [x15, #0x30]\n"
"ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
"ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
"ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"inch x28\n"
"ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
"ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
@@ -216,122 +216,122 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"whilelt p2.h, x9, %x[n_channels]\n"
"ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
"ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
"ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
"inch x14\n"
- "ld1h { z19.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
"ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
"ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
"ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
"inch x28\n"
"mov p0.b, p2.b\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index f5d4189a47..abdfac5a3f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index e6bfea1790..fdbee67926 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,14 +113,14 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x11, x23, LSL #1\n"
"add x28, x15, x17\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.h, XZR, %x[n_channels]\n"
"add x27, x9, x23, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x26, x28, x17\n"
"add x25, x27, x23, LSL #1\n"
- "ld1h { z16.h }, p3/Z, [x10]\n"
+ "ld1h { z29.h }, p3/Z, [x10]\n"
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"add x24, x26, x17\n"
"add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
@@ -146,378 +146,378 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z14.h }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
"whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x10]\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
"inch x21\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
"inch x12\n"
- "fmla z30.h, p3/M, z1.h, z8.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z26.h, p3/M, z1.h, z8.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
"addvl x14, x14, #1\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z2.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
"addvl x11, x11, #1\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z26.h, p3/M, z3.h, z24.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z7.h\n"
+ "fmla z31.h, p3/M, z18.h, z8.h\n"
"ld1h { z7.h }, p1/Z, [x11]\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z14.h\n"
+ "fmla z30.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z22.h, z8.h\n"
+ "fmla z31.h, p3/M, z22.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z22.h, z0.h\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z13.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
"addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z26.h, p3/M, z20.h, z19.h\n"
+ "fmla z30.h, p3/M, z20.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x27]\n"
- "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z5.h\n"
+ "fmla z30.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z23.h\n"
+ "fmla z31.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z2.h\n"
+ "fmla z30.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z14.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.h, p3/M, z8.h, z0.h\n"
+ "fmla z31.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z24.h\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z31.h, p3/M, z16.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
"addvl x27, x27, #1\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z16.h, z22.h\n"
+ "fmla z30.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z5.h\n"
+ "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z16.h }, p2/Z, [x25]\n"
+ "fmla z26.h, p3/M, z17.h, z0.h\n"
+ "fmla z30.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z2.h\n"
+ "fmla z31.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
"ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "fmla z30.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z25.h\n"
+ "fmla z31.h, p3/M, z23.h, z24.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "fmla z30.h, p3/M, z23.h, z4.h\n"
+ "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z4.h\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z27.h, p3/M, z18.h, z22.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "fmla z30.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z13.h, z19.h\n"
+ "fmla z31.h, p3/M, z13.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
"ld1h { z14.h }, p1/Z, [x9]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z13.h, z8.h\n"
+ "fmla z30.h, p3/M, z13.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z4.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
"ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
+ "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z31.h, p3/M, z23.h, z25.h\n"
"ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z17.h\n"
+ "fmla z30.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
"ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
+ "fmla z27.h, p3/M, z21.h, z25.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
"ld1h { z5.h }, p1/Z, [x14]\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z16.h\n"
+ "fmla z30.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
"ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
"addvl x10, x10, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z20.h, z18.h\n"
+ "fmla z30.h, p3/M, z20.h, z17.h\n"
"cmp x12, %x[n_channels]\n"
"addvl x23, x23, #1\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z26.h, p3/M, z19.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
"ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
"ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
"ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
"ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
"ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
- "st1h { z28.h }, p0, [x13]\n"
- "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
"addvl x13, x13, #1\n"
"ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z26.h }, p0, [x22]\n"
"addvl x10, x10, #-6\n"
- "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x10]\n"
+ "movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10]\n"
"ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
"ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.h, p3/M, z1.h, z8.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z5.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
"add x8, x8, #0x1\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
"cmp x8, x20\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z5.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
"add x21, x12, #0x1\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z5.h, p3/M, z3.h, z22.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
"csel x12, x12, x21, LT\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
"csel x8, x8, XZR, LT\n"
"cmp x12, x20\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z5.h, p3/M, z20.h, z14.h\n"
+ "fmla z29.h, p3/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z19.h, z1.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z13.h\n"
+ "fmla z31.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x27]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
- "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x13]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z17.h, z22.h\n"
+ "fmla z31.h, p3/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x27]\n"
+ "fmla z5.h, p3/M, z17.h, z27.h\n"
+ "fmla z29.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z6.h\n"
+ "fmla z31.h, p3/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z24.h\n"
+ "fmla z29.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z14.h\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z22.h\n"
+ "fmla z29.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z1.h\n"
+ "fmla z31.h, p3/M, z25.h, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z19.h\n"
+ "fmla z29.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z0.h\n"
+ "fmla z31.h, p3/M, z23.h, z27.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z23.h, z18.h\n"
+ "fmla z29.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z27.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z29.h, p3/M, z20.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z24.h\n"
+ "fmla z31.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z11.h\n"
+ "fmla z29.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z22.h\n"
+ "fmla z31.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z0.h\n"
+ "fmla z29.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z10.h, z19.h\n"
+ "fmla z31.h, p3/M, z10.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z10.h, z3.h\n"
+ "fmla z29.h, p3/M, z10.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z6.h, z7.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z6.h, z26.h\n"
+ "fmla z29.h, p3/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z7.h\n"
+ "fmla z31.h, p3/M, z9.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z5.h, p3/M, z9.h, z24.h\n"
+ "fmla z29.h, p3/M, z9.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z11.h\n"
+ "fmla z31.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z27.h\n"
+ "fmla z29.h, p3/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z0.h\n"
+ "fmla z31.h, p3/M, z25.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z31.h, p3/M, z23.h, z26.h\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z26.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z5.h, p3/M, z21.h, z16.h\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z27.h\n"
+ "fmla z5.h, p3/M, z20.h, z18.h\n"
+ "fmla z29.h, p3/M, z20.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z27.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z5.h, p3/M, z19.h, z17.h\n"
+ "fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z5.h, p3/M, z5.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
+ "fmin z5.h, p3/M, z5.h, z28.h\n"
+ "fmin z29.h, p3/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 2e20b524d8..1ec0cb2cbf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -104,448 +104,448 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x13, #0x0\n"
"ldp x12, x11, [x20, #0x10]\n"
"whilelt p3.h, XZR, %x[n_channels]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "cnth x28\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cnth x10\n"
"ptrue p2.b\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1h { z5.h }, p3/Z, [x10, x13, LSL #1]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1h { z6.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "sub x24, XZR, x28\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z16.h }, p2/Z, [x27]\n"
- "ld1h { z0.h }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z2.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ld1h { z3.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ld1h { z7.h }, p3/Z, [x26, x13, LSL #1]\n"
- "addvl x27, x27, #6\n"
- "ld1h { z8.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z13.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ld1h { z14.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "addvl x9, x9, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.h, p2/M, z1.h, z8.h\n"
- "fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.h, p2/M, z2.h, z9.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.h, p2/M, z0.h, z7.h\n"
- "fmla z29.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
- "whilelt p1.h, x28, %x[n_channels]\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
- "inch x24\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x27, #4, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x27]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
- "fmla z31.h, p2/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
- "ld1h { z5.h }, p1/Z, [x10, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z26.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.h, p2/M, z2.h, z9.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
- "ld1h { z6.h }, p1/Z, [x9, x28, LSL #1]\n"
- "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z27.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z26.h, p2/M, z2.h, z5.h\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "inch x13\n"
- "ld1h { z7.h }, p1/Z, [x26, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.h, p2/M, z3.h, z5.h\n"
+ "fmla z26.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- "ld1h { z8.h }, p1/Z, [x25, x28, LSL #1]\n"
- "ld1h { z9.h }, p1/Z, [x23, x28, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x22, x28, LSL #1]\n"
- "ld1h { z11.h }, p1/Z, [x21, x28, LSL #1]\n"
- "fmax z28.h, p2/M, z28.h, z18.h\n"
- "fmax z29.h, p2/M, z29.h, z18.h\n"
- "ld1h { z12.h }, p1/Z, [x20, x28, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x10, x28, LSL #1]\n"
- "fmax z30.h, p2/M, z30.h, z18.h\n"
- "fmax z31.h, p2/M, z31.h, z18.h\n"
- "ld1h { z14.h }, p1/Z, [x9, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z4.h, z20.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "fmla z26.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z21.h, z7.h\n"
+ "fmla z27.h, p2/M, z21.h, z8.h\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.h, p2/M, z21.h, z14.h\n"
+ "fmla z26.h, p2/M, z21.h, z11.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z18.h, z8.h\n"
+ "fmla z27.h, p2/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z17.h, z13.h\n"
+ "fmla z27.h, p2/M, z17.h, z5.h\n"
+ "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.h, p2/M, z17.h, z0.h\n"
+ "fmla z26.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z27.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z16.h, z29.h\n"
+ "fmla z26.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z19.h, z22.h\n"
+ "fmla z27.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z3.h\n"
+ "fmla z26.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z25.h, z14.h\n"
+ "fmla z27.h, p2/M, z25.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.h, p2/M, z25.h, z6.h\n"
+ "fmla z26.h, p2/M, z25.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z18.h, z11.h\n"
+ "fmla z27.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.h, p2/M, z18.h, z23.h\n"
+ "fmla z26.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.h, x10, %x[n_channels]\n"
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z27.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "fmla z26.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
"inch x28\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
+ "fmla z30.h, p2/M, z16.h, z29.h\n"
+ "fmla z27.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.h, p2/M, z16.h, z7.h\n"
+ "fmla z26.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z21.h, z3.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "fmla z26.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z20.h, z6.h\n"
+ "fmla z27.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z26.h, p2/M, z20.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z27.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z27.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "fmla z26.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z7.h\n"
+ "fmla z27.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "fmla z26.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z10.h, z19.h\n"
+ "fmla z27.h, p2/M, z10.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z10.h, z13.h\n"
+ "fmla z26.h, p2/M, z10.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z8.h, z0.h\n"
+ "fmla z27.h, p2/M, z8.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z8.h, z18.h\n"
+ "fmla z26.h, p2/M, z8.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.h, p2/M, z23.h, z11.h\n"
+ "fmla z27.h, p2/M, z23.h, z25.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.h, p2/M, z23.h, z17.h\n"
+ "fmla z26.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z16.h\n"
+ "fmla z26.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z27.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.h, p2/M, z20.h, z18.h\n"
+ "fmla z26.h, p2/M, z20.h, z17.h\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z27.h, p2/M, z19.h, z22.h\n"
+ "inch x13\n"
+ "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z17.h\n"
+ "fmla z26.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z27.h, p2/M, z27.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmax z26.h, p2/M, z26.h, z15.h\n"
+ "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "inch x10\n"
+ "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"whilelt p3.h, x13, %x[n_channels]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "fmin z28.h, p2/M, z28.h, z17.h\n"
- "fmin z29.h, p2/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
- "fmin z30.h, p2/M, z30.h, z17.h\n"
- "fmin z31.h, p2/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
- "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
- "addvl x27, x27, #-6\n"
- "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z27.h, p2/M, z27.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "fmin z26.h, p2/M, z26.h, z28.h\n"
+ "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
+ "addvl x9, x9, #-6\n"
+ "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.h, p2/M, z1.h, z8.h\n"
- "fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.h, p2/M, z2.h, z9.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
"fmla z29.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
- "inch x24\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x27]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
"fmla z31.h, p2/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.h, p2/M, z2.h, z9.h\n"
"fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z22.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.h, p2/M, z3.h, z11.h\n"
"fmla z31.h, p2/M, z3.h, z12.h\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "fmax z28.h, p2/M, z28.h, z18.h\n"
- "fmax z29.h, p2/M, z29.h, z18.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.h, p2/M, z3.h, z22.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- "fmax z30.h, p2/M, z30.h, z18.h\n"
- "fmax z31.h, p2/M, z31.h, z18.h\n"
- "fmin z28.h, p2/M, z28.h, z17.h\n"
- "fmin z29.h, p2/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
- "fmin z30.h, p2/M, z30.h, z17.h\n"
- "fmin z31.h, p2/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
- "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
- "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z8.h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z19.h, z8.h\n"
+ "fmla z31.h, p2/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.h, p2/M, z19.h, z1.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z18.h, z13.h\n"
+ "fmla z31.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z31.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.h, p2/M, z17.h, z27.h\n"
+ "fmla z29.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "fmla z31.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.h, p2/M, z21.h, z22.h\n"
+ "fmla z29.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z25.h, z1.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z25.h, z19.h\n"
+ "fmla z29.h, p2/M, z25.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z27.h\n"
+ "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.h, p2/M, z23.h, z18.h\n"
+ "fmla z29.h, p2/M, z23.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z20.h, z27.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.h, p2/M, z20.h, z9.h\n"
+ "fmla z29.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z24.h\n"
+ "fmla z31.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z8.h\n"
+ "fmla z29.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z21.h, z22.h\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.h, p2/M, z21.h, z10.h\n"
+ "fmla z29.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.h, p2/M, z4.h, z19.h\n"
+ "fmla z31.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.h, p2/M, z4.h, z0.h\n"
+ "fmla z29.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z6.h, z18.h\n"
+ "fmla z31.h, p2/M, z6.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z6.h, z26.h\n"
+ "fmla z29.h, p2/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z11.h, z9.h\n"
+ "fmla z31.h, p2/M, z11.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z11.h, z24.h\n"
+ "fmla z29.h, p2/M, z11.h, z27.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z8.h\n"
+ "fmla z31.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z27.h\n"
+ "fmla z29.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z25.h, z10.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z25.h, z18.h\n"
+ "fmla z29.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z26.h\n"
+ "fmla z5.h, p2/M, z23.h, z17.h\n"
+ "fmla z29.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z21.h, z26.h\n"
+ "fmla z31.h, p2/M, z21.h, z24.h\n"
+ "fmla z5.h, p2/M, z21.h, z16.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z27.h\n"
+ "fmla z5.h, p2/M, z20.h, z18.h\n"
+ "fmla z29.h, p2/M, z20.h, z17.h\n"
+ "fmla z30.h, p2/M, z19.h, z27.h\n"
+ "fmla z31.h, p2/M, z19.h, z22.h\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmla z5.h, p2/M, z19.h, z17.h\n"
+ "fmla z29.h, p2/M, z19.h, z16.h\n"
+ "fmax z5.h, p2/M, z5.h, z15.h\n"
+ "fmax z29.h, p2/M, z29.h, z15.h\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z5.h, p2/M, z5.h, z28.h\n"
+ "fmin z29.h, p2/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index d7b1de2062..16b96fdb8e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index a570c5aa6a..1bdef85274 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -108,10 +108,10 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"whilelt p2.s, XZR, %x[n_channels]\n"
"madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1w { z18.s }, p3/Z, [x10]\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
"add x27, x13, x13\n"
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
@@ -125,10 +125,10 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x11, %x[n_channels]\n"
"add x23, x25, x23, LSL #2\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
"add x22, x28, x22, LSL #2\n"
"mov x21, #0x0\n"
@@ -142,175 +142,175 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
"whilelt p1.s, x11, %x[n_channels]\n"
"incw x21\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
"incw x11\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
"mov p0.b, p2.b\n"
- "ld1w { z18.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
"addvl x9, x9, #1\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
"addvl x26, x26, #1\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
"ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
"ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
"ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
"cmp x11, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
"ld1w { z10.s }, p1/Z, [x9]\n"
"ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
"ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x28]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
"addvl x23, x23, #1\n"
- "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z22.s }, p0, [x22]\n"
"addvl x28, x28, #1\n"
"ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
"addvl x10, x10, #-6\n"
- "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
"ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
"add x21, x10, #0x1\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
"csel x10, x10, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
"csel x14, x14, XZR, LT\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
"cmp x10, x20\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p0, [x28]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 903de0d309..873b4736ff 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -87,7 +87,7 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
@@ -98,99 +98,99 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
"addvl x16, x16, #-6\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"whilelt p1.s, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
"incw x28\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
"mov p0.b, p2.b\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
"incw x9\n"
- "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
"incw x14\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
@@ -199,98 +199,98 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"cmp x14, %x[n_channels]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
"ldr x23, [x15, #0x60]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"incw x28\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 41ad193364..e4f432c9ed 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index cda34358f5..015d0e63c2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x10, x23, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
@@ -129,10 +129,10 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
"add x24, x11, x21, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x15, %x[n_channels]\n"
"add x23, x24, x21, LSL #2\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
"add x22, x16, x16\n"
"mov x21, #0x0\n"
@@ -146,131 +146,131 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x15, %x[n_channels]\n"
"incw x21\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
"incw x15\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
"incw x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
"addvl x10, x10, #1\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
"addvl x28, x28, #1\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
"addvl x14, x14, #1\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
"ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x14]\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
"ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
"addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
"ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
"addvl x26, x26, #1\n"
"ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
@@ -279,182 +279,182 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"cmp x15, %x[n_channels]\n"
"ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
"ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
"ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x26]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
"ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
- "st1w { z23.s }, p0, [x11]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
"addvl x11, x11, #1\n"
"ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
"st1w { z26.s }, p0, [x24]\n"
"addvl x13, x13, #-6\n"
- "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x8, x8, #0x1\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
"cmp x8, x20\n"
"add x21, x13, #0x1\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
"csel x13, x13, x21, LT\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
"csel x8, x8, XZR, LT\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
"cmp x13, x20\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p0, [x11]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
"st1w { z26.s }, p0, [x24]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 2eed8cb0c4..4809b0c45c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -88,390 +88,390 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "cntw x15\n"
- "mov x14, #0x0\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [x16, #0x20]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
- "addvl x17, x17, #-6\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "incw x13\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
"mov p1.b, p2.b\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x23, [x27, #0x0]\n"
- "whilelt p0.s, x15, %x[n_channels]\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x22, [x27, #0x8]\n"
- "ldr x21, [x27, #0x10]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x27, #0x18]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x28, [x16, #0x20]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ldr x23, [x27, #0x20]\n"
- "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "incw x14\n"
- "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
- "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "ld1w { z13.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
"incw x15\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "whilelt p2.s, x14, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
+ "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "incw x13\n"
- "mov p1.b, p2.b\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p0.b, p2.b\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x23, [x27, #0x0]\n"
- "ldr x22, [x27, #0x8]\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x21, [x27, #0x10]\n"
- "ldr x20, [x27, #0x18]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ldr x23, [x27, #0x20]\n"
- "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 6073b2ba7d..38b377509e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index cdf77a1cf0..35445595f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
"add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x13, x8, x23, LSL #2\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"add x12, x13, x23, LSL #2\n"
"add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -132,8 +132,8 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x5\n"
"add x26, x9, x22, LSL #2\n"
"add x25, x6, x6\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x16, %x[n_channels]\n"
"add x24, x28, x23, LSL #2\n"
"ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
@@ -149,500 +149,500 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x16, %x[n_channels]\n"
"incw x21\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
+ "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
"incw x16\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
"incw x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
"ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z21.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
"ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
"ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x28]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z14.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z13.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z13.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z13.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z27.s, p3/M, z3.s, z9.s\n"
+ "fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z25.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
"fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
- "addvl x8, x8, #1\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
- "addvl x12, x12, #1\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
"addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
"fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
"fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
"addvl x24, x24, #1\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
"addvl x13, x13, #1\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z0.s\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmla z29.s, p3/M, z5.s, z0.s\n"
+ "fmla z24.s, p3/M, z4.s, z0.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
"ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
"ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
"ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
"ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
"ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
"addvl x17, x17, #16\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
"ld1w { z10.s }, p1/Z, [x8]\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
"ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x15]\n"
+ "fmin z21.s, p3/M, z21.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
"ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z22.s, p3/M, z22.s, z16.s\n"
+ "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
"addvl x28, x28, #1\n"
- "st1w { z20.s }, p0, [x9]\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
"addvl x15, x15, #1\n"
- "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
"addvl x17, x17, #-6\n"
- "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
- "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
+ "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
"addvl x9, x9, #1\n"
- "st1w { z24.s }, p0, [x26]\n"
- "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
"addvl x26, x26, #1\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x4, x4, #0x1\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
"cmp x4, x20\n"
"add x21, x16, #0x1\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x16, x16, x21, LT\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24]\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
"csel x4, x4, XZR, LT\n"
"cmp x16, x20\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "st1w { z16.s }, p0, [x15]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "st1w { z20.s }, p0, [x9]\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z24.s }, p0, [x26]\n"
- "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z10.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z22.s\n"
+ "fmla z18.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z13.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z25.s, p3/M, z3.s, z29.s\n"
+ "fmla z12.s, p3/M, z2.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z24.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z3.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z22.s\n"
+ "fmla z11.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z21.s\n"
+ "fmla z14.s, p3/M, z2.s, z29.s\n"
+ "fmla z31.s, p3/M, z5.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z29.s\n"
+ "fmla z27.s, p3/M, z3.s, z29.s\n"
+ "fmla z30.s, p3/M, z1.s, z29.s\n"
+ "fmla z11.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z19.s\n"
+ "fmla z12.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z22.s\n"
+ "fmla z14.s, p3/M, z3.s, z22.s\n"
+ "fmla z26.s, p3/M, z1.s, z22.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z19.s\n"
+ "fmla z14.s, p3/M, z6.s, z19.s\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "fmla z13.s, p3/M, z3.s, z19.s\n"
+ "fmla z10.s, p3/M, z1.s, z19.s\n"
+ "fmla z12.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z29.s\n"
+ "fmla z18.s, p3/M, z1.s, z29.s\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x12]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z0.s, z22.s\n"
+ "fmla z17.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11]\n"
+ "fmla z12.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z6.s, z29.s\n"
+ "fmla z26.s, p3/M, z3.s, z29.s\n"
+ "fmla z10.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z9.s\n"
+ "fmla z12.s, p3/M, z7.s, z22.s\n"
+ "fmla z23.s, p3/M, z6.s, z22.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z13.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
+ "fmla z12.s, p3/M, z5.s, z21.s\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z11.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z25.s, p3/M, z6.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z18.s, p3/M, z3.s, z22.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z17.s, p3/M, z1.s, z22.s\n"
+ "fmla z14.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z4.s, z29.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z30.s, p3/M, z2.s, z29.s\n"
+ "fmla z11.s, p3/M, z1.s, z29.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmax z11.s, p3/M, z11.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmla z10.s, p3/M, z4.s, z21.s\n"
+ "fmla z12.s, p3/M, z3.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmax z10.s, p3/M, z10.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmax z12.s, p3/M, z12.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z11.s, p3/M, z11.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z17.s }, p0, [x9]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z12.s, p3/M, z12.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z10.s }, p0, [x23]\n"
+ "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
@@ -653,4 +653,4 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 0b04ae064d..3db248924f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -99,616 +99,616 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "cntw x15\n"
- "mov x14, #0x0\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "cntw x17\n"
+ "mov x16, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "incw x13\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
+ "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
"mov p1.b, p2.b\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "whilelt p0.s, x15, %x[n_channels]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.s, p3/M, z7.s, z25.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z25.s\n"
+ "fmla z31.s, p3/M, z3.s, z25.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
+ "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
+ "whilelt p0.s, x17, %x[n_channels]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "fmla z14.s, p3/M, z8.s, z25.s\n"
+ "fmla z23.s, p3/M, z5.s, z25.s\n"
+ "fmla z15.s, p3/M, z2.s, z25.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z9.s, p3/M, z1.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z13.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
"fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x9, [x16, #0x118]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.s, p3/M, z3.s, z25.s\n"
+ "fmla z14.s, p3/M, z0.s, z25.s\n"
+ "fmla z23.s, p3/M, z6.s, z29.s\n"
+ "fmla z15.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
"fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
- "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z5.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z9.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.s, p3/M, z7.s, z25.s\n"
+ "fmla z18.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.s, p3/M, z7.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z30.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.s, p3/M, z8.s, z10.s\n"
+ "fmla z21.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z5.s, z10.s\n"
+ "fmla z11.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z27.s, p3/M, z0.s, z29.s\n"
+ "fmla z14.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.s, p3/M, z6.s, z25.s\n"
+ "fmla z23.s, p3/M, z4.s, z25.s\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z15.s, p3/M, z1.s, z25.s\n"
+ "fmla z18.s, p3/M, z0.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.s, p3/M, z2.s, z25.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z0.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z5.s, z25.s\n"
+ "fmla z28.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z25.s\n"
+ "fmla z21.s, p3/M, z3.s, z25.s\n"
+ "fmla z9.s, p3/M, z8.s, z12.s\n"
+ "fmla z11.s, p3/M, z5.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z6.s, z25.s\n"
+ "fmla z15.s, p3/M, z5.s, z25.s\n"
+ "fmla z13.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z29.s\n"
+ "fmla z21.s, p3/M, z6.s, z29.s\n"
+ "fmla z23.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z25.s\n"
+ "fmla z31.s, p3/M, z7.s, z25.s\n"
+ "fmla z13.s, p3/M, z6.s, z25.s\n"
+ "fmla z18.s, p3/M, z5.s, z25.s\n"
+ "fmla z21.s, p3/M, z4.s, z25.s\n"
+ "fmla z28.s, p3/M, z3.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z27.s, p3/M, z5.s, z25.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z25.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z20.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z25.s\n"
+ "fmla z11.s, p3/M, z1.s, z25.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z12.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
+ "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.s, p3/M, z4.s, z10.s\n"
+ "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla z28.s, p3/M, z4.s, z12.s\n"
- "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x58]\n"
- "incw x14\n"
- "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
- "incw x15\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x70]\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x78]\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "whilelt p2.s, x14, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.s, p3/M, z23.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "incw x16\n"
+ "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z19.s\n"
+ "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.s, p3/M, z21.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
+ "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "incw x13\n"
- "mov p1.b, p2.b\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
"fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.s, p3/M, z7.s, z23.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
+ "fmla z15.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z23.s\n"
+ "fmla z20.s, p3/M, z3.s, z23.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
+ "fmla z27.s, p3/M, z8.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "fmla z28.s, p3/M, z2.s, z23.s\n"
"fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x9, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
- "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x58]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x70]\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x78]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "fmla z9.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z30.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z10.s, p3/M, z3.s, z29.s\n"
+ "fmla z25.s, p3/M, z2.s, z29.s\n"
+ "fmla z24.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.s, p3/M, z4.s, z29.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.s, p3/M, z8.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "fmla z15.s, p3/M, z1.s, z17.s\n"
+ "fmla z11.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z4.s, z21.s\n"
+ "fmla z14.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z1.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z23.s\n"
+ "fmla z26.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z29.s\n"
+ "fmla z15.s, p3/M, z5.s, z29.s\n"
+ "fmla z11.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z22.s, p3/M, z0.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.s, p3/M, z6.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z23.s\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z13.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "fmla z15.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z9.s, p3/M, z8.s, z23.s\n"
+ "fmla z11.s, p3/M, z5.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z5.s, z21.s\n"
+ "fmla z10.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z10.s, p3/M, z6.s, z21.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "fmla z24.s, p3/M, z4.s, z21.s\n"
+ "fmla z26.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z11.s, p3/M, z8.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z29.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z29.s\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z23.s\n"
+ "fmla z30.s, p3/M, z6.s, z23.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmla z20.s, p3/M, z8.s, z29.s\n"
+ "fmla z10.s, p3/M, z7.s, z29.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
+ "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z23.s\n"
+ "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.s, p3/M, z3.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z29.s\n"
+ "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.s, p3/M, z10.s, z19.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.s, p3/M, z25.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z25.s, p3/M, z25.s, z19.s\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 17ac74e223..75d62007ab 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 5a1f309b88..e6090fda94 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -112,7 +112,7 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x28, x12, x23, LSL #2\n"
"madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x11]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
"ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
@@ -128,8 +128,8 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x24, x26, x15\n"
"add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"cmp x13, %x[n_channels]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x23, x25, x23, LSL #2\n"
"add x22, x9, x21, LSL #2\n"
"ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
@@ -147,191 +147,191 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
"whilelt p1.s, x13, %x[n_channels]\n"
"incw x21\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
"incw x13\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z14.s }, p2/Z, [x25]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p2/Z, [x27]\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
"addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
- "ld1w { z19.s }, p3/Z, [x11]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
"ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
"addvl x27, x27, #1\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
"ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
"ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
"ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
"ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
"ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
"cmp x13, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
"ld1w { z10.s }, p1/Z, [x12]\n"
"ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
"ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
"addvl x25, x25, #1\n"
"ld1w { z14.s }, p1/Z, [x28]\n"
"ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
"addvl x23, x23, #1\n"
"ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
- "st1w { z28.s }, p0, [x9]\n"
+ "st1w { z27.s }, p0, [x9]\n"
"ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
"addvl x9, x9, #1\n"
"ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
"addvl x11, x11, #-6\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z14.s }, p2/Z, [x25]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
"add x16, x16, #0x1\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p2/Z, [x27]\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
"cmp x16, x20\n"
"add x21, x11, #0x1\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
"csel x11, x11, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
"csel x16, x16, XZR, LT\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
"ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
"cmp x11, x20\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z28.s }, p0, [x9]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index eb6c2daa97..98427701fa 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -96,7 +96,7 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
@@ -111,8 +111,8 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
@@ -126,89 +126,89 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
"whilelt p1.s, x14, %x[n_channels]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"incw x9\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
"ldp x21, x20, [x15, #0x30]\n"
"ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
"ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
"ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"incw x28\n"
"ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
@@ -216,122 +216,122 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"whilelt p2.s, x9, %x[n_channels]\n"
"ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
"ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
"ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "ld1w { z19.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
"ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
"ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
"incw x28\n"
"mov p0.b, p2.b\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 2449c96637..ae89a64c6b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index b4cf6c8582..075181a488 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,14 +113,14 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x11, x23, LSL #2\n"
"add x28, x15, x17\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.s, XZR, %x[n_channels]\n"
"add x27, x9, x23, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x26, x28, x17\n"
"add x25, x27, x23, LSL #2\n"
- "ld1w { z16.s }, p3/Z, [x10]\n"
+ "ld1w { z29.s }, p3/Z, [x10]\n"
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"add x24, x26, x17\n"
"add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -146,378 +146,378 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z14.s }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
"whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x10]\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
"incw x21\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
"incw x12\n"
- "fmla z30.s, p3/M, z1.s, z8.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z26.s, p3/M, z1.s, z8.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
"addvl x14, x14, #1\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
"addvl x11, x11, #1\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z3.s, z5.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z6.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z7.s\n"
- "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "fmla z26.s, p3/M, z3.s, z24.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z7.s\n"
+ "fmla z31.s, p3/M, z18.s, z8.s\n"
"ld1w { z7.s }, p1/Z, [x11]\n"
- "fmla z30.s, p3/M, z0.s, z14.s\n"
- "fmla z31.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z8.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z14.s\n"
+ "fmla z30.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z22.s, z8.s\n"
+ "fmla z31.s, p3/M, z22.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z22.s, z0.s\n"
+ "fmla z30.s, p3/M, z22.s, z19.s\n"
+ "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z13.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
"addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z26.s, p3/M, z20.s, z19.s\n"
+ "fmla z30.s, p3/M, z20.s, z5.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.s, p3/M, z3.s, z5.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x27]\n"
- "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z6.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z5.s\n"
- "fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z6.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z23.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z5.s\n"
+ "fmla z30.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z23.s\n"
+ "fmla z31.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z2.s\n"
+ "fmla z30.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z14.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.s, p3/M, z8.s, z0.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z24.s\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z31.s, p3/M, z16.s, z5.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
"addvl x27, x27, #1\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z16.s, z22.s\n"
+ "fmla z30.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z5.s\n"
+ "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "fmla z26.s, p3/M, z17.s, z0.s\n"
+ "fmla z30.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z2.s\n"
+ "fmla z31.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
"ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z5.s\n"
- "fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z31.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "fmla z30.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z25.s\n"
+ "fmla z31.s, p3/M, z23.s, z24.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "fmla z30.s, p3/M, z23.s, z4.s\n"
+ "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z4.s\n"
+ "fmla z30.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z27.s, p3/M, z18.s, z22.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z31.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z31.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "fmla z30.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z13.s, z19.s\n"
+ "fmla z31.s, p3/M, z13.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
"ld1w { z14.s }, p1/Z, [x9]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z13.s, z8.s\n"
+ "fmla z30.s, p3/M, z13.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
"ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z5.s\n"
+ "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z31.s, p3/M, z23.s, z25.s\n"
"ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z17.s\n"
+ "fmla z30.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
"ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z6.s\n"
+ "fmla z27.s, p3/M, z21.s, z25.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
"ld1w { z5.s }, p1/Z, [x14]\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z16.s\n"
+ "fmla z30.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
"ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z8.s\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
"addvl x10, x10, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z20.s, z18.s\n"
+ "fmla z30.s, p3/M, z20.s, z17.s\n"
"cmp x12, %x[n_channels]\n"
"addvl x23, x23, #1\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z9.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z26.s, p3/M, z19.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
"ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
"ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
"ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
"ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
"ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
- "st1w { z28.s }, p0, [x13]\n"
- "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
"addvl x13, x13, #1\n"
"ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z26.s }, p0, [x22]\n"
"addvl x10, x10, #-6\n"
- "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x10]\n"
+ "movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10]\n"
"ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
"ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.s, p3/M, z1.s, z8.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z5.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
"add x8, x8, #0x1\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
"cmp x8, x20\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z5.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
"add x21, x12, #0x1\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.s, p3/M, z3.s, z5.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z5.s, p3/M, z3.s, z22.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
"csel x12, x12, x21, LT\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z6.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z0.s, z7.s\n"
- "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "fmla z30.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
"csel x8, x8, XZR, LT\n"
"cmp x12, x20\n"
- "fmla z30.s, p3/M, z0.s, z14.s\n"
- "fmla z31.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z8.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z5.s, p3/M, z20.s, z14.s\n"
+ "fmla z29.s, p3/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z19.s, z1.s\n"
+ "fmla z29.s, p3/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z13.s\n"
+ "fmla z31.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z18.s, z0.s\n"
+ "fmla z29.s, p3/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.s, p3/M, z3.s, z5.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x27]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z6.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z5.s\n"
- "fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z6.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
- "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z5.s\n"
- "fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z31.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z31.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z31.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z5.s\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z6.s\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z8.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z9.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x13]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z17.s, z22.s\n"
+ "fmla z31.s, p3/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x27]\n"
+ "fmla z5.s, p3/M, z17.s, z27.s\n"
+ "fmla z29.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z6.s\n"
+ "fmla z31.s, p3/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z24.s\n"
+ "fmla z29.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z14.s\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z22.s\n"
+ "fmla z29.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z1.s\n"
+ "fmla z31.s, p3/M, z25.s, z0.s\n"
+ "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z19.s\n"
+ "fmla z29.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z0.s\n"
+ "fmla z31.s, p3/M, z23.s, z27.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z23.s, z18.s\n"
+ "fmla z29.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z27.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p2/Z, [x25]\n"
+ "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z29.s, p3/M, z20.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z24.s\n"
+ "fmla z31.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z11.s\n"
+ "fmla z29.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z22.s\n"
+ "fmla z31.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z0.s\n"
+ "fmla z29.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z10.s, z19.s\n"
+ "fmla z31.s, p3/M, z10.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z10.s, z3.s\n"
+ "fmla z29.s, p3/M, z10.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z6.s, z7.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z6.s, z26.s\n"
+ "fmla z29.s, p3/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z7.s\n"
+ "fmla z31.s, p3/M, z9.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z5.s, p3/M, z9.s, z24.s\n"
+ "fmla z29.s, p3/M, z9.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z11.s\n"
+ "fmla z31.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z27.s\n"
+ "fmla z29.s, p3/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z0.s\n"
+ "fmla z31.s, p3/M, z25.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z31.s, p3/M, z23.s, z26.s\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z21.s, z26.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z5.s, p3/M, z21.s, z16.s\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z27.s\n"
+ "fmla z5.s, p3/M, z20.s, z18.s\n"
+ "fmla z29.s, p3/M, z20.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z27.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z5.s, p3/M, z19.s, z17.s\n"
+ "fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z5.s, p3/M, z5.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
+ "fmin z5.s, p3/M, z5.s, z28.s\n"
+ "fmin z29.s, p3/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x22]\n"
+ "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index cb70bd2b6f..bf65e04d32 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -104,448 +104,448 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x13, #0x0\n"
"ldp x12, x11, [x20, #0x10]\n"
"whilelt p3.s, XZR, %x[n_channels]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "cntw x28\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cntw x10\n"
"ptrue p2.b\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1w { z5.s }, p3/Z, [x10, x13, LSL #2]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1w { z6.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "sub x24, XZR, x28\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "ld1rw { z18.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z16.s }, p2/Z, [x27]\n"
- "ld1w { z0.s }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ld1w { z7.s }, p3/Z, [x26, x13, LSL #2]\n"
- "addvl x27, x27, #6\n"
- "ld1w { z8.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z13.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ld1w { z14.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z29.s }, p2/Z, [x9]\n"
+ "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "addvl x9, x9, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p2/M, z1.s, z8.s\n"
- "fmla z31.s, p2/M, z1.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.s, p2/M, z2.s, z9.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z2.s, z13.s\n"
- "fmla z31.s, p2/M, z2.s, z5.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.s, p2/M, z3.s, z5.s\n"
- "fmla z31.s, p2/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z6.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.s, p2/M, z0.s, z7.s\n"
- "fmla z29.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p2/M, z0.s, z14.s\n"
- "fmla z31.s, p2/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.s, p2/M, z1.s, z8.s\n"
- "fmla z29.s, p2/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.s, p2/M, z1.s, z11.s\n"
- "fmla z31.s, p2/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.s, p2/M, z2.s, z13.s\n"
- "fmla z29.s, p2/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.s, p2/M, z2.s, z12.s\n"
- "fmla z31.s, p2/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.s, p2/M, z3.s, z5.s\n"
- "fmla z29.s, p2/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.s, p2/M, z3.s, z9.s\n"
- "fmla z31.s, p2/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.s, p2/M, z4.s, z6.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z13.s\n"
- "fmla z31.s, p2/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.s, p2/M, z0.s, z14.s\n"
- "fmla z29.s, p2/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.s, p2/M, z0.s, z5.s\n"
- "fmla z31.s, p2/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.s, p2/M, z1.s, z11.s\n"
- "fmla z29.s, p2/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z31.s, p2/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
- "whilelt p1.s, x28, %x[n_channels]\n"
- "fmla z28.s, p2/M, z2.s, z12.s\n"
- "fmla z29.s, p2/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.s, p2/M, z2.s, z10.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
- "incw x24\n"
- "fmla z28.s, p2/M, z3.s, z9.s\n"
- "fmla z29.s, p2/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.s, p2/M, z4.s, z13.s\n"
- "fmla z29.s, p2/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.s, p2/M, z0.s, z5.s\n"
- "fmla z29.s, p2/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.s, p2/M, z0.s, z9.s\n"
- "fmla z31.s, p2/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x27, #4, MUL VL]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.s, p2/M, z1.s, z13.s\n"
- "fmla z31.s, p2/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p2/Z, [x27]\n"
- "fmla z28.s, p2/M, z2.s, z10.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z2.s, z5.s\n"
- "fmla z31.s, p2/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z3.s, z6.s\n"
- "fmla z31.s, p2/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z8.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.s, p2/M, z0.s, z9.s\n"
- "fmla z29.s, p2/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z0.s, z11.s\n"
- "fmla z31.s, p2/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "fmla z28.s, p2/M, z1.s, z13.s\n"
- "fmla z29.s, p2/M, z1.s, z5.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "fmla z30.s, p2/M, z1.s, z12.s\n"
- "fmla z31.s, p2/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "fmla z28.s, p2/M, z2.s, z5.s\n"
- "fmla z29.s, p2/M, z2.s, z6.s\n"
- "ld1w { z5.s }, p1/Z, [x10, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z26.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.s, p2/M, z2.s, z9.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "fmla z28.s, p2/M, z3.s, z6.s\n"
- "fmla z29.s, p2/M, z3.s, z8.s\n"
- "ld1w { z6.s }, p1/Z, [x9, x28, LSL #2]\n"
- "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z27.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z26.s, p2/M, z2.s, z5.s\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "fmla z28.s, p2/M, z4.s, z8.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "incw x13\n"
- "ld1w { z7.s }, p1/Z, [x26, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.s, p2/M, z3.s, z5.s\n"
+ "fmla z26.s, p2/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z9.s\n"
- "ld1w { z8.s }, p1/Z, [x25, x28, LSL #2]\n"
- "ld1w { z9.s }, p1/Z, [x23, x28, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x21, x28, LSL #2]\n"
- "fmax z28.s, p2/M, z28.s, z18.s\n"
- "fmax z29.s, p2/M, z29.s, z18.s\n"
- "ld1w { z12.s }, p1/Z, [x20, x28, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x10, x28, LSL #2]\n"
- "fmax z30.s, p2/M, z30.s, z18.s\n"
- "fmax z31.s, p2/M, z31.s, z18.s\n"
- "ld1w { z14.s }, p1/Z, [x9, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z22.s\n"
+ "fmla z26.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z21.s, z7.s\n"
+ "fmla z27.s, p2/M, z21.s, z8.s\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.s, p2/M, z21.s, z14.s\n"
+ "fmla z26.s, p2/M, z21.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z18.s, z8.s\n"
+ "fmla z27.s, p2/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z17.s, z13.s\n"
+ "fmla z27.s, p2/M, z17.s, z5.s\n"
+ "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.s, p2/M, z17.s, z0.s\n"
+ "fmla z26.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z16.s, z5.s\n"
+ "fmla z27.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.s, p2/M, z16.s, z29.s\n"
+ "fmla z26.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z19.s, z22.s\n"
+ "fmla z27.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z3.s\n"
+ "fmla z26.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z25.s, z14.s\n"
+ "fmla z27.s, p2/M, z25.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.s, p2/M, z25.s, z6.s\n"
+ "fmla z26.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z18.s, z11.s\n"
+ "fmla z27.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.s, p2/M, z18.s, z23.s\n"
+ "fmla z26.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p2/M, z17.s, z0.s\n"
+ "fmla z27.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.s, p2/M, z17.s, z22.s\n"
+ "fmla z26.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
"incw x28\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
+ "fmla z30.s, p2/M, z16.s, z29.s\n"
+ "fmla z27.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.s, p2/M, z16.s, z7.s\n"
+ "fmla z26.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z21.s, z3.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "fmla z26.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z20.s, z6.s\n"
+ "fmla z27.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z26.s, p2/M, z20.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.s, p2/M, z18.s, z23.s\n"
+ "fmla z27.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z27.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z17.s, z25.s\n"
+ "fmla z26.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z7.s\n"
+ "fmla z27.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z16.s, z24.s\n"
+ "fmla z26.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z10.s, z19.s\n"
+ "fmla z27.s, p2/M, z10.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z10.s, z13.s\n"
+ "fmla z26.s, p2/M, z10.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z8.s, z0.s\n"
+ "fmla z27.s, p2/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z8.s, z18.s\n"
+ "fmla z26.s, p2/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.s, p2/M, z23.s, z11.s\n"
+ "fmla z27.s, p2/M, z23.s, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.s, p2/M, z23.s, z17.s\n"
+ "fmla z26.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.s, p2/M, z21.s, z25.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z16.s\n"
+ "fmla z26.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z27.s, p2/M, z20.s, z13.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.s, p2/M, z20.s, z18.s\n"
+ "fmla z26.s, p2/M, z20.s, z17.s\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.s, p2/M, z19.s, z13.s\n"
+ "fmla z27.s, p2/M, z19.s, z22.s\n"
+ "incw x13\n"
+ "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z17.s\n"
+ "fmla z26.s, p2/M, z19.s, z16.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z27.s, p2/M, z27.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmax z26.s, p2/M, z26.s, z15.s\n"
+ "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"whilelt p3.s, x13, %x[n_channels]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "fmin z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
- "fmin z30.s, p2/M, z30.s, z17.s\n"
- "fmin z31.s, p2/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
- "addvl x27, x27, #-6\n"
- "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z27.s, p2/M, z27.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "fmin z26.s, p2/M, z26.s, z28.s\n"
+ "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
+ "addvl x9, x9, #-6\n"
+ "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p2/M, z1.s, z8.s\n"
- "fmla z31.s, p2/M, z1.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.s, p2/M, z2.s, z9.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z2.s, z13.s\n"
- "fmla z31.s, p2/M, z2.s, z5.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.s, p2/M, z3.s, z5.s\n"
- "fmla z31.s, p2/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z6.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.s, p2/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
"fmla z29.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p2/M, z0.s, z14.s\n"
- "fmla z31.s, p2/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.s, p2/M, z1.s, z8.s\n"
- "fmla z29.s, p2/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.s, p2/M, z1.s, z11.s\n"
- "fmla z31.s, p2/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.s, p2/M, z2.s, z13.s\n"
- "fmla z29.s, p2/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.s, p2/M, z2.s, z12.s\n"
- "fmla z31.s, p2/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.s, p2/M, z3.s, z5.s\n"
- "fmla z29.s, p2/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.s, p2/M, z3.s, z9.s\n"
- "fmla z31.s, p2/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.s, p2/M, z4.s, z6.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z13.s\n"
- "fmla z31.s, p2/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.s, p2/M, z0.s, z14.s\n"
- "fmla z29.s, p2/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.s, p2/M, z0.s, z5.s\n"
- "fmla z31.s, p2/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.s, p2/M, z1.s, z11.s\n"
- "fmla z29.s, p2/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z31.s, p2/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
- "incw x24\n"
- "fmla z28.s, p2/M, z2.s, z12.s\n"
- "fmla z29.s, p2/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.s, p2/M, z2.s, z10.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.s, p2/M, z3.s, z9.s\n"
- "fmla z29.s, p2/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z13.s\n"
- "fmla z29.s, p2/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.s, p2/M, z0.s, z5.s\n"
- "fmla z29.s, p2/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.s, p2/M, z0.s, z9.s\n"
- "fmla z31.s, p2/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.s, p2/M, z1.s, z13.s\n"
- "fmla z31.s, p2/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p2/Z, [x27]\n"
- "fmla z28.s, p2/M, z2.s, z10.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z2.s, z5.s\n"
- "fmla z31.s, p2/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z3.s, z6.s\n"
- "fmla z31.s, p2/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z8.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.s, p2/M, z0.s, z9.s\n"
- "fmla z29.s, p2/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z0.s, z11.s\n"
- "fmla z31.s, p2/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z1.s, z13.s\n"
- "fmla z29.s, p2/M, z1.s, z5.s\n"
- "fmla z30.s, p2/M, z1.s, z12.s\n"
"fmla z31.s, p2/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z2.s, z5.s\n"
- "fmla z29.s, p2/M, z2.s, z6.s\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.s, p2/M, z2.s, z9.s\n"
"fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z3.s, z6.s\n"
- "fmla z29.s, p2/M, z3.s, z8.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.s, p2/M, z3.s, z11.s\n"
"fmla z31.s, p2/M, z3.s, z12.s\n"
- "fmla z28.s, p2/M, z4.s, z8.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "fmax z28.s, p2/M, z28.s, z18.s\n"
- "fmax z29.s, p2/M, z29.s, z18.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.s, p2/M, z3.s, z22.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z9.s\n"
- "fmax z30.s, p2/M, z30.s, z18.s\n"
- "fmax z31.s, p2/M, z31.s, z18.s\n"
- "fmin z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
- "fmin z30.s, p2/M, z30.s, z17.s\n"
- "fmin z31.s, p2/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z20.s, z7.s\n"
+ "fmla z31.s, p2/M, z20.s, z8.s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.s, p2/M, z20.s, z14.s\n"
+ "fmla z29.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z19.s, z8.s\n"
+ "fmla z31.s, p2/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.s, p2/M, z19.s, z1.s\n"
+ "fmla z29.s, p2/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z18.s, z13.s\n"
+ "fmla z31.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.s, p2/M, z18.s, z0.s\n"
+ "fmla z29.s, p2/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z31.s, p2/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.s, p2/M, z17.s, z27.s\n"
+ "fmla z29.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z16.s, z6.s\n"
+ "fmla z31.s, p2/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z24.s\n"
+ "fmla z29.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "fmla z31.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.s, p2/M, z21.s, z22.s\n"
+ "fmla z29.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z25.s, z1.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z25.s, z19.s\n"
+ "fmla z29.s, p2/M, z25.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z27.s\n"
+ "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.s, p2/M, z23.s, z18.s\n"
+ "fmla z29.s, p2/M, z23.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z20.s, z27.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.s, p2/M, z20.s, z9.s\n"
+ "fmla z29.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z24.s\n"
+ "fmla z31.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z8.s\n"
+ "fmla z29.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z21.s, z22.s\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.s, p2/M, z21.s, z10.s\n"
+ "fmla z29.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.s, p2/M, z4.s, z19.s\n"
+ "fmla z31.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.s, p2/M, z4.s, z0.s\n"
+ "fmla z29.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z6.s, z18.s\n"
+ "fmla z31.s, p2/M, z6.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z6.s, z26.s\n"
+ "fmla z29.s, p2/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z11.s, z9.s\n"
+ "fmla z31.s, p2/M, z11.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z11.s, z24.s\n"
+ "fmla z29.s, p2/M, z11.s, z27.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z8.s\n"
+ "fmla z31.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z27.s\n"
+ "fmla z29.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z25.s, z10.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z25.s, z18.s\n"
+ "fmla z29.s, p2/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z26.s\n"
+ "fmla z5.s, p2/M, z23.s, z17.s\n"
+ "fmla z29.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z21.s, z26.s\n"
+ "fmla z31.s, p2/M, z21.s, z24.s\n"
+ "fmla z5.s, p2/M, z21.s, z16.s\n"
+ "fmla z29.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z27.s\n"
+ "fmla z5.s, p2/M, z20.s, z18.s\n"
+ "fmla z29.s, p2/M, z20.s, z17.s\n"
+ "fmla z30.s, p2/M, z19.s, z27.s\n"
+ "fmla z31.s, p2/M, z19.s, z22.s\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmla z5.s, p2/M, z19.s, z17.s\n"
+ "fmla z29.s, p2/M, z19.s, z16.s\n"
+ "fmax z5.s, p2/M, z5.s, z15.s\n"
+ "fmax z29.s, p2/M, z29.s, z15.s\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z28.s\n"
+ "fmin z29.s, p2/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index 62faca97a9..6b155fc855 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 204f36edca..d53daaa8a0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -55,9 +55,9 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
"2:" // Channel loop: Load bias: Done
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
- "subs x25, %x[n_points], #0x1\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, %x[n_points], #0x1\n"
"ldp x24, x23, [x10], #0x10\n"
"ldp x22, x21, [x10], #0x10\n"
"mov z24.d, z23.d\n"
@@ -68,12 +68,12 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
"mov z28.d, z23.d\n"
"mov z29.d, z23.d\n"
- "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
- "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
"mov z30.d, z23.d\n"
"mov z31.d, z23.d\n"
- "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
- "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
@@ -82,9 +82,9 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
- "subs x25, x25, #0x1\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, x9, #0x1\n"
"fmla z23.s, p1/M, z14.s, z0.s\n"
"ldp x24, x23, [x10], #0x10\n"
"ldp x22, x21, [x10], #0x10\n"
@@ -93,15 +93,15 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ldr x20, [x10], #0x8\n"
"fmla z26.s, p1/M, z17.s, z0.s\n"
"fmla z27.s, p1/M, z18.s, z0.s\n"
- "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
"fmla z28.s, p1/M, z19.s, z0.s\n"
"fmla z29.s, p1/M, z20.s, z0.s\n"
- "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
"fmla z31.s, p1/M, z22.s, z0.s\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
- "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 8640343747..eb1b111c36 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 7ba0edd991..3a71baaf61 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -53,21 +53,21 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr x11, [%x[inptrs], #0x20]\n"
"ldr x10, [%x[inptrs], #0x28]\n"
"ldr x9, [%x[inptrs], #0x30]\n"
- "ld1w { z26.s }, p2/Z, [%x[params]]\n"
- "mov z25.d, z26.d\n"
- "mov z24.d, z26.d\n"
+ "ld1w { z24.s }, p2/Z, [%x[params]]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
- "mov z23.d, z26.d\n"
- "mov z22.d, z26.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "mov z21.d, z26.d\n"
- "mov z20.d, z26.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z20.d, z24.d\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"ld1rqw { z2.s }, p1/Z, [x16]\n"
- "mov z19.d, z26.d\n"
- "mov z18.d, z26.d\n"
+ "mov z23.d, z24.d\n"
+ "mov z19.d, z24.d\n"
"ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
"ld1rqw { z4.s }, p1/Z, [x15]\n"
"ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
@@ -81,175 +81,175 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
"ld1rqw { z14.s }, p1/Z, [x9]\n"
"ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
- "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
"ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
"1:" // Output channel complete vector loop
- "fmla z26.s, z31.s, z2.s[0]\n"
- "fmla z23.s, z31.s, z6.s[0]\n"
- "mov z0.d, z10.d\n"
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "fmla z27.s, z31.s, z6.s[0]\n"
+ "mov z1.d, z10.d\n"
"incw x17\n"
- "fmla z22.s, z31.s, z6.s[2]\n"
- "fmla z21.s, z31.s, z7.s[0]\n"
- "mov z1.d, z11.d\n"
+ "fmla z26.s, z31.s, z6.s[2]\n"
+ "fmla z28.s, z31.s, z7.s[0]\n"
+ "mov z0.d, z11.d\n"
"mov p0.b, p2.b\n"
- "fmla z25.s, z31.s, z2.s[2]\n"
- "fmla z24.s, z31.s, z3.s[0]\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z25.s, z31.s, z3.s[0]\n"
"whilelt p2.s, x17, %x[channel_multiplier]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "fmla z26.s, z30.s, z2.s[1]\n"
- "ld1w { z31.s }, p1/Z, [%x[params]]\n"
- "fmla z23.s, z30.s, z6.s[1]\n"
- "fmla z22.s, z30.s, z6.s[3]\n"
- "fmla z21.s, z30.s, z7.s[1]\n"
- "fmla z25.s, z30.s, z2.s[3]\n"
- "fmla z24.s, z30.s, z3.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "fmla z26.s, z29.s, z2.s[2]\n"
- "fmla z23.s, z29.s, z6.s[2]\n"
- "fmla z22.s, z29.s, z7.s[0]\n"
- "fmla z21.s, z29.s, z7.s[2]\n"
- "fmla z25.s, z29.s, z3.s[0]\n"
- "fmla z24.s, z29.s, z3.s[2]\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z31.s, z4.s[0]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z20.s, z31.s, z1.s[0]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "fmla z27.s, z30.s, z6.s[1]\n"
+ "fmla z26.s, z30.s, z6.s[3]\n"
+ "fmla z28.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z25.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[3]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z27.s, z29.s, z6.s[2]\n"
+ "fmla z26.s, z29.s, z7.s[0]\n"
+ "fmla z28.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z25.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z18.s, z4.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z5.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z17.s, z4.s[1]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z25.s, z17.s, z5.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[2]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z27.s, z31.s, z1.s[2]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z25.s, z31.s, z5.s[2]\n"
+ "fmla z20.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
"fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[2]\n"
- "mov z0.d, z12.d\n"
- "fmla z21.s, z31.s, z1.s[0]\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z31.s, z4.s[2]\n"
- "fmla z24.s, z31.s, z5.s[0]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
"fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z30.s, z4.s[1]\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[3]\n"
- "fmla z21.s, z30.s, z1.s[1]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z30.s, z4.s[3]\n"
- "fmla z24.s, z30.s, z5.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "mov z0.d, z8.d\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z29.s, z4.s[2]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmla z22.s, z29.s, z1.s[0]\n"
- "mov z0.d, z12.d\n"
- "fmla z21.s, z29.s, z1.s[2]\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z29.s, z5.s[0]\n"
- "fmla z24.s, z29.s, z5.s[2]\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "mov z0.d, z10.d\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "mov z1.d, z11.d\n"
+ "mov z0.d, z11.d\n"
"ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
- "fmla z26.s, z31.s, z6.s[0]\n"
- "fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[2]\n"
- "fmla z21.s, z31.s, z1.s[0]\n"
- "mov z0.d, z14.d\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z31.s, z6.s[2]\n"
- "fmla z24.s, z31.s, z7.s[0]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "mov z0.d, z10.d\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "mov z1.d, z11.d\n"
- "fmla z26.s, z30.s, z6.s[1]\n"
+ "fmla z24.s, z18.s, z6.s[0]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z14.d\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z17.s, z6.s[1]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[3]\n"
- "mov z0.d, z14.d\n"
- "fmla z21.s, z30.s, z1.s[1]\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z30.s, z6.s[3]\n"
- "fmla z24.s, z30.s, z7.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z26.s, z29.s, z6.s[2]\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmin z26.s, p1/M, z26.s, z16.s\n"
- "fmla z22.s, z29.s, z1.s[0]\n"
- "fmla z21.s, z29.s, z1.s[2]\n"
- "mov z0.d, z14.d\n"
- "fmax z26.s, p1/M, z26.s, z17.s\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z29.s, z7.s[0]\n"
- "fmla z24.s, z29.s, z7.s[2]\n"
- "fmin z25.s, p1/M, z25.s, z16.s\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z14.d\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z17.s, z6.s[3]\n"
+ "fmla z25.s, z17.s, z7.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z27.s, z29.s, z1.s[2]\n"
"fmin z24.s, p1/M, z24.s, z16.s\n"
- "fmin z23.s, p1/M, z23.s, z16.s\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "fmin z22.s, p1/M, z22.s, z16.s\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z28.s, z29.s, z0.s[2]\n"
+ "mov z1.d, z14.d\n"
+ "fmax z24.s, p1/M, z24.s, z22.s\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z25.s, z29.s, z7.s[2]\n"
"fmin z21.s, p1/M, z21.s, z16.s\n"
- "st1w { z26.s }, p0, [x28, x12, LSL #2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmin z27.s, p1/M, z27.s, z16.s\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
"fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
"fmin z19.s, p1/M, z19.s, z16.s\n"
- "ld1w { z26.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmin z18.s, p1/M, z18.s, z16.s\n"
"addvl %x[params], %x[params], #16\n"
"ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "fmax z25.s, p1/M, z25.s, z17.s\n"
- "fmax z24.s, p1/M, z24.s, z17.s\n"
- "st1w { z25.s }, p0, [x27, x12, LSL #2]\n"
- "mov z25.d, z26.d\n"
- "fmax z23.s, p1/M, z23.s, z17.s\n"
- "fmax z22.s, p1/M, z22.s, z17.s\n"
- "st1w { z24.s }, p0, [x26, x12, LSL #2]\n"
- "mov z24.d, z26.d\n"
- "fmax z21.s, p1/M, z21.s, z17.s\n"
- "fmax z20.s, p1/M, z20.s, z17.s\n"
- "st1w { z23.s }, p0, [x25, x12, LSL #2]\n"
- "mov z23.d, z26.d\n"
- "fmax z19.s, p1/M, z19.s, z17.s\n"
- "fmax z18.s, p1/M, z18.s, z17.s\n"
- "st1w { z22.s }, p0, [x24, x12, LSL #2]\n"
- "mov z22.d, z26.d\n"
- "st1w { z21.s }, p0, [x23, x12, LSL #2]\n"
- "mov z21.d, z26.d\n"
+ "fmax z21.s, p1/M, z21.s, z22.s\n"
+ "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z27.s, p1/M, z27.s, z22.s\n"
+ "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z25.d, z24.d\n"
+ "fmax z28.s, p1/M, z28.s, z22.s\n"
+ "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z27.d, z24.d\n"
+ "fmax z23.s, p1/M, z23.s, z22.s\n"
+ "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z26.d, z24.d\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z28.d, z24.d\n"
"addvl %x[params], %x[params], #-6\n"
"st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
- "mov z20.d, z26.d\n"
- "st1w { z19.s }, p0, [x21, x12, LSL #2]\n"
- "mov z19.d, z26.d\n"
- "st1w { z18.s }, p0, [x20, x12, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
"incw x12\n"
- "mov z18.d, z26.d\n"
+ "mov z19.d, z24.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index a4ee87cce2..cc0c4236a8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 2ea116fc9e..84ab4b5035 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -52,21 +52,21 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"mov x10, #0x0\n"
"ldr x9, [%x[inptrs], #0x20]\n"
"ldr x28, [%x[inptrs], #0x28]\n"
- "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params]]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "mov z24.d, z25.d\n"
- "mov z23.d, z25.d\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "mov z22.d, z25.d\n"
- "mov z21.d, z25.d\n"
+ "mov z24.d, z16.d\n"
+ "mov z14.d, z16.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"ld1rqw { z2.s }, p1/Z, [x14]\n"
- "mov z20.d, z25.d\n"
- "mov z19.d, z25.d\n"
+ "mov z26.d, z16.d\n"
+ "mov z17.d, z16.d\n"
"ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
"ld1rqw { z4.s }, p1/Z, [x13]\n"
- "mov z18.d, z25.d\n"
+ "mov z23.d, z16.d\n"
"ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
"ld1rqw { z6.s }, p1/Z, [x12]\n"
"ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
@@ -76,8 +76,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
"ld1rqw { z12.s }, p1/Z, [x28]\n"
"ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
- "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
- "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
@@ -85,304 +85,304 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #6\n"
"1:" // Output channel complete vector loop
- "fmla z25.s, z31.s, z2.s[0]\n"
- "fmla z24.s, z31.s, z2.s[1]\n"
+ "fmla z16.s, z31.s, z2.s[0]\n"
+ "fmla z25.s, z31.s, z2.s[1]\n"
"mov z0.d, z8.d\n"
"incw x15\n"
- "fmla z23.s, z31.s, z2.s[2]\n"
- "fmla z22.s, z31.s, z2.s[3]\n"
+ "fmla z15.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z2.s[3]\n"
"mov z1.d, z9.d\n"
"mov p0.b, p2.b\n"
- "fmla z21.s, z31.s, z4.s[0]\n"
- "fmla z20.s, z31.s, z4.s[1]\n"
+ "fmla z14.s, z31.s, z4.s[0]\n"
+ "fmla z26.s, z31.s, z4.s[1]\n"
"whilelt p2.s, x15, %x[channel_multiplier]\n"
- "fmla z19.s, z31.s, z4.s[2]\n"
- "fmla z18.s, z31.s, z4.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params]]\n"
- "fmla z25.s, z30.s, z2.s[1]\n"
- "fmla z24.s, z30.s, z2.s[2]\n"
- "fmla z23.s, z30.s, z2.s[3]\n"
- "fmla z22.s, z30.s, z3.s[0]\n"
- "fmla z21.s, z30.s, z4.s[1]\n"
- "fmla z20.s, z30.s, z4.s[2]\n"
- "fmla z19.s, z30.s, z4.s[3]\n"
- "fmla z18.s, z30.s, z5.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "fmla z25.s, z29.s, z2.s[2]\n"
- "fmla z24.s, z29.s, z2.s[3]\n"
- "fmla z23.s, z29.s, z3.s[0]\n"
- "fmla z22.s, z29.s, z3.s[1]\n"
- "fmla z21.s, z29.s, z4.s[2]\n"
- "fmla z20.s, z29.s, z4.s[3]\n"
- "fmla z19.s, z29.s, z5.s[0]\n"
- "fmla z18.s, z29.s, z5.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "fmla z25.s, z28.s, z2.s[3]\n"
- "fmla z24.s, z28.s, z3.s[0]\n"
- "fmla z23.s, z28.s, z3.s[1]\n"
- "fmla z22.s, z28.s, z3.s[2]\n"
- "fmla z21.s, z28.s, z4.s[3]\n"
- "fmla z20.s, z28.s, z5.s[0]\n"
- "fmla z19.s, z28.s, z5.s[1]\n"
- "fmla z18.s, z28.s, z5.s[2]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z23.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z16.s, z30.s, z2.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[2]\n"
+ "fmla z15.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[0]\n"
+ "fmla z14.s, z30.s, z4.s[1]\n"
+ "fmla z26.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z23.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z25.s, z29.s, z2.s[3]\n"
+ "fmla z15.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[1]\n"
+ "fmla z14.s, z29.s, z4.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z23.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z2.s[3]\n"
+ "fmla z25.s, z28.s, z3.s[0]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z24.s, z28.s, z3.s[2]\n"
+ "fmla z14.s, z28.s, z4.s[3]\n"
+ "fmla z26.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z23.s, z28.s, z5.s[2]\n"
"ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "fmla z25.s, z27.s, z3.s[0]\n"
- "fmla z24.s, z27.s, z3.s[1]\n"
- "fmla z23.s, z27.s, z3.s[2]\n"
- "fmla z22.s, z27.s, z3.s[3]\n"
- "fmla z21.s, z27.s, z5.s[0]\n"
- "fmla z20.s, z27.s, z5.s[1]\n"
- "fmla z19.s, z27.s, z5.s[2]\n"
- "fmla z18.s, z27.s, z5.s[3]\n"
+ "fmla z16.s, z27.s, z3.s[0]\n"
+ "fmla z25.s, z27.s, z3.s[1]\n"
+ "fmla z15.s, z27.s, z3.s[2]\n"
+ "fmla z24.s, z27.s, z3.s[3]\n"
+ "fmla z14.s, z27.s, z5.s[0]\n"
+ "fmla z26.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z23.s, z27.s, z5.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "fmla z25.s, z31.s, z4.s[0]\n"
- "fmla z24.s, z31.s, z4.s[1]\n"
- "fmla z23.s, z31.s, z4.s[2]\n"
- "fmla z22.s, z31.s, z4.s[3]\n"
- "fmla z21.s, z31.s, z6.s[0]\n"
- "fmla z20.s, z31.s, z6.s[1]\n"
- "fmla z19.s, z31.s, z6.s[2]\n"
- "fmla z18.s, z31.s, z6.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
- "fmla z25.s, z30.s, z4.s[1]\n"
- "fmla z24.s, z30.s, z4.s[2]\n"
- "fmla z23.s, z30.s, z4.s[3]\n"
- "fmla z22.s, z30.s, z5.s[0]\n"
- "fmla z21.s, z30.s, z6.s[1]\n"
- "fmla z20.s, z30.s, z6.s[2]\n"
- "fmla z19.s, z30.s, z6.s[3]\n"
- "fmla z18.s, z30.s, z7.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "fmla z25.s, z29.s, z4.s[2]\n"
- "fmla z24.s, z29.s, z4.s[3]\n"
- "fmla z23.s, z29.s, z5.s[0]\n"
- "fmla z22.s, z29.s, z5.s[1]\n"
- "fmla z21.s, z29.s, z6.s[2]\n"
- "fmla z20.s, z29.s, z6.s[3]\n"
- "fmla z19.s, z29.s, z7.s[0]\n"
- "fmla z18.s, z29.s, z7.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z16.s, z20.s, z4.s[0]\n"
+ "fmla z25.s, z20.s, z4.s[1]\n"
+ "fmla z15.s, z20.s, z4.s[2]\n"
+ "fmla z24.s, z20.s, z4.s[3]\n"
+ "fmla z14.s, z20.s, z6.s[0]\n"
+ "fmla z26.s, z20.s, z6.s[1]\n"
+ "fmla z17.s, z20.s, z6.s[2]\n"
+ "fmla z23.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z4.s[1]\n"
+ "fmla z25.s, z19.s, z4.s[2]\n"
+ "fmla z15.s, z19.s, z4.s[3]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z14.s, z19.s, z6.s[1]\n"
+ "fmla z26.s, z19.s, z6.s[2]\n"
+ "fmla z17.s, z19.s, z6.s[3]\n"
+ "fmla z23.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z4.s[3]\n"
+ "fmla z15.s, z18.s, z5.s[0]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z14.s, z18.s, z6.s[2]\n"
+ "fmla z26.s, z18.s, z6.s[3]\n"
+ "fmla z17.s, z18.s, z7.s[0]\n"
+ "fmla z23.s, z18.s, z7.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmla z25.s, z28.s, z4.s[3]\n"
- "fmla z24.s, z28.s, z5.s[0]\n"
- "fmla z23.s, z28.s, z5.s[1]\n"
- "fmla z22.s, z28.s, z5.s[2]\n"
- "fmla z21.s, z28.s, z6.s[3]\n"
- "fmla z20.s, z28.s, z7.s[0]\n"
- "fmla z19.s, z28.s, z7.s[1]\n"
- "fmla z18.s, z28.s, z7.s[2]\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "fmla z25.s, z27.s, z5.s[0]\n"
- "fmla z24.s, z27.s, z5.s[1]\n"
- "fmla z23.s, z27.s, z5.s[2]\n"
- "fmla z22.s, z27.s, z5.s[3]\n"
- "fmla z21.s, z27.s, z7.s[0]\n"
- "fmla z20.s, z27.s, z7.s[1]\n"
- "fmla z19.s, z27.s, z7.s[2]\n"
- "fmla z18.s, z27.s, z7.s[3]\n"
+ "fmla z16.s, z28.s, z4.s[3]\n"
+ "fmla z25.s, z28.s, z5.s[0]\n"
+ "fmla z15.s, z28.s, z5.s[1]\n"
+ "fmla z24.s, z28.s, z5.s[2]\n"
+ "fmla z14.s, z28.s, z6.s[3]\n"
+ "fmla z26.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z23.s, z28.s, z7.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z27.s, z5.s[0]\n"
+ "fmla z25.s, z27.s, z5.s[1]\n"
+ "fmla z15.s, z27.s, z5.s[2]\n"
+ "fmla z24.s, z27.s, z5.s[3]\n"
+ "fmla z14.s, z27.s, z7.s[0]\n"
+ "fmla z26.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z23.s, z27.s, z7.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "fmla z25.s, z31.s, z6.s[0]\n"
- "fmla z24.s, z31.s, z6.s[1]\n"
- "fmla z23.s, z31.s, z6.s[2]\n"
- "fmla z22.s, z31.s, z6.s[3]\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "fmla z25.s, z30.s, z6.s[1]\n"
- "fmla z24.s, z30.s, z6.s[2]\n"
- "fmla z23.s, z30.s, z6.s[3]\n"
- "fmla z22.s, z30.s, z7.s[0]\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "fmla z25.s, z29.s, z6.s[2]\n"
- "fmla z24.s, z29.s, z6.s[3]\n"
- "fmla z23.s, z29.s, z7.s[0]\n"
- "fmla z22.s, z29.s, z7.s[1]\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "fmla z25.s, z28.s, z6.s[3]\n"
- "fmla z24.s, z28.s, z7.s[0]\n"
- "fmla z23.s, z28.s, z7.s[1]\n"
- "fmla z22.s, z28.s, z7.s[2]\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "fmla z25.s, z27.s, z7.s[0]\n"
- "fmla z24.s, z27.s, z7.s[1]\n"
- "fmla z23.s, z27.s, z7.s[2]\n"
- "fmla z22.s, z27.s, z7.s[3]\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
+ "fmla z16.s, z20.s, z6.s[0]\n"
+ "fmla z25.s, z20.s, z6.s[1]\n"
+ "fmla z15.s, z20.s, z6.s[2]\n"
+ "fmla z24.s, z20.s, z6.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z16.s, z19.s, z6.s[1]\n"
+ "fmla z25.s, z19.s, z6.s[2]\n"
+ "fmla z15.s, z19.s, z6.s[3]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "fmla z23.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z6.s[3]\n"
+ "fmla z15.s, z18.s, z7.s[0]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z14.s, z18.s, z0.s[2]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "fmla z17.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z30.s, z6.s[3]\n"
+ "fmla z25.s, z30.s, z7.s[0]\n"
+ "fmla z15.s, z30.s, z7.s[1]\n"
+ "fmla z24.s, z30.s, z7.s[2]\n"
+ "fmla z14.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z1.s[0]\n"
+ "fmla z17.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z16.s, z27.s, z7.s[0]\n"
+ "fmla z25.s, z27.s, z7.s[1]\n"
+ "fmla z15.s, z27.s, z7.s[2]\n"
+ "fmla z24.s, z27.s, z7.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "fmla z25.s, z31.s, z0.s[0]\n"
- "fmla z24.s, z31.s, z0.s[1]\n"
- "fmla z23.s, z31.s, z0.s[2]\n"
- "fmla z22.s, z31.s, z0.s[3]\n"
- "mov z0.d, z10.d\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "fmla z25.s, z30.s, z0.s[1]\n"
- "fmla z24.s, z30.s, z0.s[2]\n"
- "fmla z23.s, z30.s, z0.s[3]\n"
- "fmla z22.s, z30.s, z1.s[0]\n"
- "mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z30.s }, p1/Z, [%x[params]]\n"
- "mov z1.d, z9.d\n"
- "fmla z25.s, z29.s, z0.s[2]\n"
- "fmla z24.s, z29.s, z0.s[3]\n"
- "fmla z23.s, z29.s, z1.s[0]\n"
- "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
"mov z0.d, z8.d\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mov z1.d, z9.d\n"
- "fmla z25.s, z28.s, z0.s[3]\n"
- "fmla z24.s, z28.s, z1.s[0]\n"
- "fmla z23.s, z28.s, z1.s[1]\n"
- "fmla z22.s, z28.s, z1.s[2]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z19.s }, p1/Z, [%x[params]]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z31.s, z1.s[3]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z15.s, z31.s, z0.s[1]\n"
+ "fmla z24.s, z31.s, z0.s[2]\n"
"mov z0.d, z10.d\n"
"mov z1.d, z11.d\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
+ "fmla z14.s, z31.s, z0.s[3]\n"
+ "fmla z26.s, z31.s, z1.s[0]\n"
+ "fmla z17.s, z31.s, z1.s[1]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
"mov z1.d, z9.d\n"
"ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "fmla z25.s, z27.s, z1.s[0]\n"
- "fmla z24.s, z27.s, z1.s[1]\n"
- "fmla z23.s, z27.s, z1.s[2]\n"
- "fmla z22.s, z27.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z1.s[0]\n"
+ "fmla z25.s, z27.s, z1.s[1]\n"
+ "fmla z15.s, z27.s, z1.s[2]\n"
+ "fmla z24.s, z27.s, z1.s[3]\n"
"mov z1.d, z11.d\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "fmla z25.s, z31.s, z0.s[0]\n"
- "fmla z24.s, z31.s, z0.s[1]\n"
- "fmla z23.s, z31.s, z0.s[2]\n"
- "fmla z22.s, z31.s, z0.s[3]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
"mov z0.d, z12.d\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
"ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z25.s, z30.s, z0.s[1]\n"
- "fmla z24.s, z30.s, z0.s[2]\n"
- "fmla z23.s, z30.s, z0.s[3]\n"
- "fmla z22.s, z30.s, z1.s[0]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "mov z0.d, z10.d\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z10.d\n"
"ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "mov z1.d, z11.d\n"
- "fmla z25.s, z29.s, z0.s[2]\n"
- "fmla z24.s, z29.s, z0.s[3]\n"
- "fmla z23.s, z29.s, z1.s[0]\n"
- "fmla z22.s, z29.s, z1.s[1]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
- "mov z0.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
"ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "mov z1.d, z11.d\n"
- "fmla z25.s, z28.s, z0.s[3]\n"
- "fmla z24.s, z28.s, z1.s[0]\n"
- "fmla z23.s, z28.s, z1.s[1]\n"
- "fmla z22.s, z28.s, z1.s[2]\n"
- "mov z1.d, z13.d\n"
- "mov z0.d, z12.d\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z25.s, z27.s, z1.s[0]\n"
- "fmla z24.s, z27.s, z1.s[1]\n"
- "fmin z25.s, p1/M, z25.s, z16.s\n"
- "fmax z25.s, p1/M, z25.s, z17.s\n"
- "fmla z23.s, z27.s, z1.s[2]\n"
- "fmla z22.s, z27.s, z1.s[3]\n"
- "mov z1.d, z13.d\n"
- "fmin z24.s, p1/M, z24.s, z16.s\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmin z23.s, p1/M, z23.s, z16.s\n"
- "fmin z22.s, p1/M, z22.s, z16.s\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
- "fmin z21.s, p1/M, z21.s, z16.s\n"
- "fmin z20.s, p1/M, z20.s, z16.s\n"
- "fmin z19.s, p1/M, z19.s, z16.s\n"
- "fmin z18.s, p1/M, z18.s, z16.s\n"
- "st1w { z25.s }, p0, [x27, x10, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z0.s[1]\n"
+ "fmla z24.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "mov z1.d, z12.d\n"
+ "fmla z26.s, z28.s, z0.s[0]\n"
+ "fmla z17.s, z28.s, z0.s[1]\n"
+ "fmla z23.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z28.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z0.s[0]\n"
+ "fmla z25.s, z27.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z22.s\n"
+ "fmax z16.s, p1/M, z16.s, z21.s\n"
+ "fmla z15.s, z27.s, z0.s[2]\n"
+ "fmla z24.s, z27.s, z0.s[3]\n"
+ "mov z0.d, z13.d\n"
+ "fmin z25.s, p1/M, z25.s, z22.s\n"
+ "fmla z14.s, z27.s, z0.s[0]\n"
+ "fmla z26.s, z27.s, z0.s[1]\n"
+ "fmin z15.s, p1/M, z15.s, z22.s\n"
+ "fmin z24.s, p1/M, z24.s, z22.s\n"
+ "fmla z17.s, z27.s, z0.s[2]\n"
+ "fmla z23.s, z27.s, z0.s[3]\n"
+ "fmin z14.s, p1/M, z14.s, z22.s\n"
+ "fmin z26.s, p1/M, z26.s, z22.s\n"
+ "fmin z17.s, p1/M, z17.s, z22.s\n"
+ "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "st1w { z16.s }, p0, [x27, x10, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmax z24.s, p1/M, z24.s, z17.s\n"
- "st1w { z24.s }, p0, [x26, x10, LSL #2]\n"
- "mov z24.d, z25.d\n"
- "fmax z23.s, p1/M, z23.s, z17.s\n"
- "fmax z22.s, p1/M, z22.s, z17.s\n"
- "st1w { z23.s }, p0, [x25, x10, LSL #2]\n"
- "mov z23.d, z25.d\n"
- "fmax z21.s, p1/M, z21.s, z17.s\n"
- "fmax z20.s, p1/M, z20.s, z17.s\n"
- "st1w { z22.s }, p0, [x24, x10, LSL #2]\n"
- "mov z22.d, z25.d\n"
- "fmax z19.s, p1/M, z19.s, z17.s\n"
- "fmax z18.s, p1/M, z18.s, z17.s\n"
- "st1w { z21.s }, p0, [x23, x10, LSL #2]\n"
- "mov z21.d, z25.d\n"
- "st1w { z20.s }, p0, [x22, x10, LSL #2]\n"
- "mov z20.d, z25.d\n"
+ "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z15.s, p1/M, z15.s, z21.s\n"
+ "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z15.d, z16.d\n"
+ "fmax z14.s, p1/M, z14.s, z21.s\n"
+ "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z14.d, z16.d\n"
+ "st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z26.d, z16.d\n"
"ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
"ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "st1w { z19.s }, p0, [x21, x10, LSL #2]\n"
- "mov z19.d, z25.d\n"
+ "st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z17.d, z16.d\n"
"addvl %x[params], %x[params], #-6\n"
- "st1w { z18.s }, p0, [x20, x10, LSL #2]\n"
+ "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
"incw x10\n"
- "mov z18.d, z25.d\n"
+ "mov z23.d, z16.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index e1f0b50d89..f83767d8ae 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -49,4 +49,4 @@ struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index a43b81d7e8..1770ec182c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -46,405 +46,405 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ptrue p1.b\n"
"mov x9, #0x0\n"
- "ld1rw { z10.s }, p1/Z, [%x[minmax_vals]]\n"
- "ld1rw { z13.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
"whilelt p0.s, x9, %x[n_output_channels]\n"
"1:" // Output channel loop
- "mov z5.b, #0x0\n"
+ "mov z31.b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ld1w { z5.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
"2:" // Output channel loop: Load bias: Done
- "mov x21, %x[inptrs]\n"
- "ldp x24, x28, [x21], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov z16.d, z5.d\n"
- "mov z17.d, z5.d\n"
- "mov z18.d, z5.d\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "mov z19.d, z5.d\n"
- "mov z20.d, z5.d\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "mov z21.d, z5.d\n"
- "mov z22.d, z5.d\n"
+ "mov x23, %x[inptrs]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "lsr x22, %x[kernel_points], #0x1\n"
+ "mov z16.d, z31.d\n"
+ "mov z17.d, z31.d\n"
+ "mov z18.d, z31.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "mov z19.d, z31.d\n"
+ "mov z20.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "mov z21.d, z31.d\n"
+ "mov z22.d, z31.d\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
- "mov z23.d, z5.d\n"
- "mov z24.d, z5.d\n"
- "mov z25.d, z5.d\n"
- "mov z26.d, z5.d\n"
- "mov z27.d, z5.d\n"
- "mov z28.d, z5.d\n"
- "mov z29.d, z5.d\n"
- "mov z30.d, z5.d\n"
- "mov z31.d, z5.d\n"
- "cbz x20, 6f\n"
- "ldp x24, x28, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
- "ld1rqw { z5.s }, p1/Z, [x24]\n"
- "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
- "ld1rqw { z3.s }, p1/Z, [x28]\n"
- "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "mov z23.d, z31.d\n"
+ "mov z24.d, z31.d\n"
+ "mov z25.d, z31.d\n"
+ "mov z26.d, z31.d\n"
+ "mov z27.d, z31.d\n"
+ "mov z28.d, z31.d\n"
+ "mov z29.d, z31.d\n"
+ "mov z30.d, z31.d\n"
+ "mov z31.d, z31.d\n"
+ "cbz x22, 6f\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
"ld1w { z11.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x24, x28, [x21], #0x10\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "subs x20, x20, #0x1\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "ldp x24, x28, [x21], #0x10\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x24]\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "ld1rqw { z3.s }, p1/Z, [x28]\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
"ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
"addvl %x[weights], %x[weights], #2\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "ldp x24, x28, [x21], #0x10\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldp x20, x28, [x23], #0x10\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x28]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
"addvl %x[weights], %x[weights], #1\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmla z16.s, z10.s, z6.s[0]\n"
+ "fmla z17.s, z10.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z10.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z10.s, z5.s[0]\n"
+ "fmla z21.s, z10.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z23.s, z10.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z10.s, z1.s[0]\n"
+ "fmla z25.s, z10.s, z1.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z10.s, z1.s[2]\n"
+ "fmla z27.s, z10.s, z1.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z10.s, z2.s[0]\n"
+ "fmla z29.s, z10.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"7:" // Output channel loop: Done
"incw x9\n"
"whilelt p0.s, x9, %x[n_output_channels]\n"
"b.any 1b\n"
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 4e2ee43374..32ea009e8a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 4eae5961a0..0cee302c56 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,472 +30,464 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
"mov x20, #0x1\n"
- "ptrue p1.b\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
"orr x20, x20, #0x100\n"
"orr x20, x20, #0x10000\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "dup z12.s, w20\n"
- "mov x20, #0x0\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z4.d, z10.d\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"1:" // Loop
- "mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z8.b\n"
- "sdot z10.s, z21.b, z14.b\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z19.s, z12.b, z3.b\n"
- "sdot z31.s, z21.b, z8.b\n"
- "incw x13, ALL, MUL #4\n"
- "sdot z10.s, z16.b, z8.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z30.b\n"
- "sdot z19.s, z12.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "sdot z31.s, z16.b, z3.b\n"
- "sdot z10.s, z20.b, z3.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z4.s, z21.b, z14.b\n"
- "sdot z26.s, z21.b, z8.b\n"
- "mov z17.s, #0x0\n"
- "sdot z17.s, z12.b, z8.b\n"
- "sdot z17.s, z12.b, z3.b\n"
- "sdot z31.s, z20.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z4.s, z16.b, z8.b\n"
- "sdot z26.s, z16.b, z3.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z30.b\n"
- "mov z19.s, #0x0\n"
- "sdot z17.s, z12.b, z14.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params]]\n"
- "sdot z4.s, z20.b, z3.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "sdot z26.s, z20.b, z30.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- "sdot z19.s, z12.b, z7.b\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "sqadd z10.s, z10.s, z21.s\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "sdot z19.s, z12.b, z2.b\n"
- "and z16.d, z4.d, z8.d\n"
- "and z20.d, z31.d, z8.d\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z29.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sdot z19.s, z12.b, z13.b\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- "add z10.s, z10.s, z22.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "add z31.s, z31.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z7.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z13.b\n"
- "sdot z10.s, z16.b, z7.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "sdot z4.s, z21.b, z13.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z7.b\n"
- "sdot z17.s, z12.b, z7.b\n"
- "incw x20\n"
- "sdot z31.s, z16.b, z2.b\n"
- "sdot z10.s, z20.b, z2.b\n"
+ "mov z30.s, #0x0\n"
+ "sdot z30.s, z25.b, z9.b\n"
+ "sdot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z30.s, z25.b, z18.b\n"
+ "sdot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
+ "sdot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "sdot z27.s, z25.b, z9.b\n"
+ "sdot z31.s, z3.b, z18.b\n"
+ "sdot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "sdot z22.s, z26.b, z15.b\n"
+ "sdot z21.s, z26.b, z9.b\n"
+ "sdot z27.s, z25.b, z18.b\n"
+ "sdot z31.s, z1.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z4.s, z16.b, z7.b\n"
- "sdot z26.s, z16.b, z2.b\n"
+ "sdot z22.s, z3.b, z9.b\n"
+ "sdot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "sdot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sdot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "sdot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sdot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "sdot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "sdot z28.s, z1.b, z13.b\n"
+ "sdot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "sdot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "sdot z22.s, z1.b, z19.b\n"
+ "sdot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "sdot z31.s, z15.b, z12.b\n"
+ "sdot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z26.s, z15.b, z19.b\n"
+ "sdot z22.s, z15.b, z12.b\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z17.s, z12.b, z2.b\n"
- "sdot z31.s, z20.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "sdot z4.s, z20.b, z2.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "sdot z26.s, z20.b, z29.b\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z29.b\n"
- "and z21.d, z10.d, z8.d\n"
- "sdot z17.s, z12.b, z13.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z6.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "sdot z19.s, z12.b, z1.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z6.b\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z28.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z11.b\n"
- "sdot z10.s, z16.b, z6.b\n"
- "sdot z19.s, z12.b, z11.b\n"
+ "sdot z18.s, z25.b, z12.b\n"
+ "sdot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "sdot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "sdot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "sdot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "sdot z23.s, z19.b, z14.b\n"
+ "sdot z23.s, z30.b, z11.b\n"
+ "sdot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z28.s, z19.b, z14.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z4.s, z21.b, z11.b\n"
- "ext z6.b, z6.b, z6.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z6.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z17.s, z12.b, z6.b\n"
- "sdot z31.s, z16.b, z1.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z10.s, z20.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z4.s, z16.b, z6.b\n"
- "sdot z26.s, z16.b, z1.b\n"
- "sdot z17.s, z12.b, z1.b\n"
- "sdot z31.s, z20.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "sdot z4.s, z20.b, z1.b\n"
- "sdot z26.s, z20.b, z28.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z28.b\n"
- "sdot z17.s, z12.b, z11.b\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "mov z12.s, #0x0\n"
+ "sdot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z12.s, z25.b, z11.b\n"
+ "sdot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "sdot z28.s, z30.b, z11.b\n"
+ "sdot z26.s, z30.b, z20.b\n"
+ "sdot z12.s, z25.b, z20.b\n"
+ "sdot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "sdot z28.s, z21.b, z20.b\n"
+ "sdot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
+ "sdot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params]]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "sdot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "sdot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "sdot z1.s, z21.b, z29.b\n"
+ "sdot z1.s, z13.b, z17.b\n"
+ "sdot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
"mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z5.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "sdot z19.s, z12.b, z0.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z5.b\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z27.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z9.b\n"
- "sdot z10.s, z16.b, z5.b\n"
- "sdot z19.s, z12.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "sdot z4.s, z21.b, z9.b\n"
+ "sdot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z19.s, z25.b, z17.b\n"
+ "sdot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z30.s, z13.b, z4.b\n"
+ "sdot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "sdot z31.s, z20.b, z5.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z5.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "sdot z17.s, z12.b, z5.b\n"
- "sdot z31.s, z16.b, z0.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z10.s, z20.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z4.s, z16.b, z5.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "sdot z26.s, z16.b, z0.b\n"
- "sdot z17.s, z12.b, z0.b\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "sdot z31.s, z20.b, z27.b\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "sdot z4.s, z20.b, z0.b\n"
- "sdot z26.s, z20.b, z27.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z27.b\n"
- "sdot z17.s, z12.b, z9.b\n"
- "and z21.d, z10.d, z8.d\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "sdot z0.s, z20.b, z4.b\n"
+ "sdot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
+ "sdot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
"asr z21.s, z21.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "asr z16.s, z16.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
"sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "incw x20\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
- "mov z4.d, z10.d\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 3e9765165c..c9b4daf334 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4ebf5be285..8ac522dc9a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,320 +91,320 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
- "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
- "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1sb { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1sb { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1sb { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1sb { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1sb { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
- "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
- "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 78bcd1407f..7a9b8a5bde 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 357c9f8399..fc9a48bb46 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1sb { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
"ld1sb { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 41ecd520ae..1f8d6c5213 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index d8f4d8d199..7ff724ddd8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
- "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
- "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
- "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1sb { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
- "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
- "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
- "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 2e8c2019db..abc09ee5a3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 6fba4d47d2..274b29dcfc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -47,285 +47,285 @@ void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr x20, [%x[inptrs], #0x10]\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z15.b, #0x1\n"
- "lsr z15.s, z15.s, #0x8\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
"ld1b { z1.b }, p0/Z, [x23]\n"
"ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z30.d, z1.d\n"
- "mov z29.d, z1.d\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z28.d, z1.d\n"
- "mov z27.d, z2.d\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
"ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
"mov z26.d, z2.d\n"
- "mov z25.d, z2.d\n"
"ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z24.d, z4.d\n"
- "mov z23.d, z4.d\n"
- "ptrue p2.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z22.d, z4.d\n"
- "ext z30.b, z30.b, z30.b, #0x2\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z14.s, p2/M, z14.s\n"
- "ext z29.b, z29.b, z29.b, #0x4\n"
- "ext z28.b, z28.b, z28.b, #0x6\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
- "ext z27.b, z27.b, z27.b, #0x2\n"
- "ext z26.b, z26.b, z26.b, #0x4\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z25.b, z25.b, z25.b, #0x6\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z23.b, z23.b, z23.b, #0x4\n"
- "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"mov z21.d, z0.d\n"
"mov z20.d, z0.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"mov z19.d, z0.d\n"
- "mov z18.d, z3.d\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
"mov z16.d, z3.d\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"ext z21.b, z21.b, z21.b, #0x2\n"
"ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
"ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z29.s\n"
- "zip1 z30.s, z30.s, z28.s\n"
- "zip1 z2.s, z2.s, z26.s\n"
- "zip1 z27.s, z27.s, z25.s\n"
- "ext z18.b, z18.b, z18.b, #0x2\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
"ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z23.s\n"
- "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
"zip1 z0.s, z0.s, z20.s\n"
"zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z30.s\n"
- "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
"zip1 z3.s, z3.s, z17.s\n"
- "zip1 z18.s, z18.s, z16.s\n"
- "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
"zip1 z0.s, z0.s, z21.s\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z18.s\n"
+ "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
"mov z24.s, #0x0\n"
"mov z25.s, #0x0\n"
- "sdot z24.s, z15.b, z1.b[0]\n"
+ "sdot z24.s, z13.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
- "sdot z25.s, z15.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "sdot z23.s, z15.b, z1.b[2]\n"
- "mov z9.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "sdot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
+ "sdot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
"mov z18.s, #0x0\n"
- "sdot z21.s, z15.b, z2.b[0]\n"
+ "sdot z21.s, z13.b, z2.b[0]\n"
"mov z17.s, #0x0\n"
"mov z16.s, #0x0\n"
- "sdot z20.s, z15.b, z2.b[1]\n"
- "sdot z9.s, z15.b, z2.b[2]\n"
- "sdot z8.s, z15.b, z2.b[3]\n"
+ "sdot z19.s, z13.b, z2.b[1]\n"
+ "sdot z10.s, z13.b, z2.b[2]\n"
+ "sdot z8.s, z13.b, z2.b[3]\n"
"mov z0.q, z0.q[0]\n"
- "sdot z19.s, z15.b, z4.b[0]\n"
- "sdot z18.s, z15.b, z4.b[1]\n"
+ "sdot z20.s, z13.b, z4.b[0]\n"
+ "sdot z18.s, z13.b, z4.b[1]\n"
"mov z3.q, z3.q[0]\n"
- "sdot z17.s, z15.b, z4.b[2]\n"
- "sdot z16.s, z15.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[2]\n"
+ "sdot z16.s, z13.b, z4.b[3]\n"
"mov z31.s, #0x0\n"
"mov z30.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "sdot z31.s, z15.b, z0.b[0]\n"
+ "mov z26.s, #0x0\n"
+ "sdot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z28.s, #0x0\n"
- "sdot z30.s, z15.b, z0.b[1]\n"
- "sdot z29.s, z15.b, z0.b[2]\n"
- "sdot z28.s, z15.b, z0.b[3]\n"
+ "sdot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "sdot z26.s, z13.b, z0.b[2]\n"
+ "sdot z27.s, z13.b, z0.b[3]\n"
+ "sdot z28.s, z13.b, z3.b[0]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
"add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z20.s\n"
- "add z26.s, z23.s, z9.s\n"
- "add z27.s, z22.s, z8.s\n"
- "add z23.s, z19.s, z21.s\n"
- "mov z22.s, #0x0\n"
- "sdot z22.s, z15.b, z3.b[0]\n"
- "add z21.s, z18.s, z20.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
"mov z20.s, #0x0\n"
- "sdot z20.s, z15.b, z3.b[1]\n"
- "add z19.s, z17.s, z9.s\n"
+ "sdot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
"mov z18.s, #0x0\n"
- "sdot z18.s, z15.b, z3.b[2]\n"
- "add z17.s, z16.s, z8.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z15.b, z3.b[3]\n"
+ "sdot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
"add z24.s, z24.s, z31.s\n"
"add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z14.s\n"
- "mul z25.s, p2/M, z25.s, z14.s\n"
- "add z26.s, z26.s, z29.s\n"
- "add z27.s, z27.s, z28.s\n"
- "mul z26.s, p2/M, z26.s, z14.s\n"
- "mul z27.s, p2/M, z27.s, z14.s\n"
- "add z28.s, z23.s, z22.s\n"
- "add z29.s, z21.s, z20.s\n"
- "mul z28.s, p2/M, z28.s, z14.s\n"
- "mul z29.s, p2/M, z29.s, z14.s\n"
- "add z30.s, z19.s, z18.s\n"
- "add z31.s, z17.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z14.s\n"
- "mul z31.s, p2/M, z31.s, z14.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
"zip1 z19.s, z24.s, z26.s\n"
"zip1 z18.s, z25.s, z27.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
"zip1 z22.s, z19.s, z18.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"1:" // Loop
"sdot z24.s, z5.b, z0.b[0]\n"
"sdot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params]]\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"sdot z26.s, z5.b, z0.b[2]\n"
"sdot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"sdot z24.s, z6.b, z1.b[0]\n"
"sdot z25.s, z6.b, z1.b[1]\n"
- "whilelt p1.b, x9, x10\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"sdot z26.s, z6.b, z1.b[2]\n"
"sdot z27.s, z6.b, z1.b[3]\n"
"sdot z28.s, z5.b, z2.b[0]\n"
"sdot z29.s, z5.b, z2.b[1]\n"
"sdot z30.s, z5.b, z2.b[2]\n"
"sdot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"sdot z24.s, z7.b, z2.b[0]\n"
"sdot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
"sdot z26.s, z7.b, z2.b[2]\n"
"sdot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"sdot z28.s, z6.b, z3.b[0]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"sdot z30.s, z6.b, z3.b[2]\n"
"sdot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"sdot z28.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z20.d\n"
+ "and z19.d, z24.d, z21.d\n"
"sdot z30.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z20.d\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "and z17.d, z26.d, z20.d\n"
- "and z16.d, z27.d, z20.d\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #6\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
- ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
- ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
- ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
"sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
- ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
- "and z19.d, z28.d, z20.d\n"
- "and z18.d, z29.d, z20.d\n"
- "and z17.d, z30.d, z20.d\n"
- "and z16.d, z31.d, z20.d\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
- ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
- ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "smin z25.s, p2/M, z25.s, z10.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z10.s\n"
- "smin z27.s, p2/M, z27.s, z10.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z28.s, p2/M, z28.s, z10.s\n"
- "smin z29.s, p2/M, z29.s, z10.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
- "smin z30.s, p2/M, z30.s, z10.s\n"
- "smin z31.s, p2/M, z31.s, z10.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z25.s, p2/M, z25.s, z11.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z11.s\n"
- "smax z27.s, p2/M, z27.s, z11.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z11.s\n"
- "smax z29.s, p2/M, z29.s, z11.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z11.s\n"
- "smax z31.s, p2/M, z31.s, z11.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z13.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z13.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 4874fb9a77..701948f264 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 2ed7cfc815..a3b2b429c0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -36,7 +36,7 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
const int8_t *const *const inptrs,
int8_t *const *const outptrs,
const void *params,
- const unsigned int n_output_channels,
+ unsigned int n_output_channels,
const arm_gemm::Requantize32& qp
)
{
@@ -47,8 +47,8 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ldr x21, [%x[inptrs], #0x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z20.d, z3.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
"mov z18.d, z4.d\n"
@@ -59,132 +59,132 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z3.d, z3.d, z23.d\n"
"zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z20.d, z1.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z13.d, z5.d\n"
- "mov z19.d, z6.d\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z25.d, z7.d\n"
+ "mov z8.d, z7.d\n"
"zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
"mov z4.q, z4.q[0]\n"
"ptrue p2.b\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
"neg z23.s, p2/M, z23.s\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- "mov z30.b, #0x1\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
"mov z24.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "sdot z24.s, z30.b, z3.b[0]\n"
- "ld1w { z12.s }, p1/Z, [%x[params]]\n"
- "mov z18.s, #0x0\n"
+ "sdot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
"mov z17.s, #0x0\n"
- "sdot z28.s, z30.b, z3.b[2]\n"
+ "mov z16.s, #0x0\n"
+ "sdot z24.s, z28.b, z3.b[2]\n"
"mov x28, #0x0\n"
- "mov z16.d, z0.d\n"
- "sdot z18.s, z30.b, z4.b[0]\n"
- "sdot z17.s, z30.b, z4.b[2]\n"
+ "mov z27.d, z0.d\n"
+ "sdot z17.s, z28.b, z4.b[0]\n"
+ "sdot z16.s, z28.b, z4.b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "zip1 z1.d, z1.d, z20.d\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z13.d\n"
+ "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z19.d\n"
- "zip1 z7.d, z7.d, z25.d\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z26.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "sdot z26.s, z30.b, z2.b[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "sdot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z29.s, #0x1\n"
- "sdot z22.s, z30.b, z2.b[2]\n"
- "sdot z24.s, z29.b, z3.b[1]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z16.d\n"
+ "sdot z31.s, z28.b, z2.b[2]\n"
+ "sdot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
"mov z1.q, z1.q[0]\n"
- "sdot z28.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "sdot z18.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "mov z21.s, #0x0\n"
- "sdot z17.s, z29.b, z4.b[3]\n"
+ "mov z22.s, #0x0\n"
+ "sdot z16.s, z29.b, z4.b[3]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z20.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "sdot z21.s, z30.b, z1.b[0]\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z22.s, z28.b, z1.b[0]\n"
"mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
"mov z19.s, #0x0\n"
- "sdot z20.s, z30.b, z1.b[2]\n"
- "sdot z25.s, z30.b, z5.b[0]\n"
- "sdot z27.s, z30.b, z5.b[2]\n"
- "mov z0.q, z0.q[0]\n"
- "sdot z19.s, z30.b, z6.b[0]\n"
- "sdot z26.s, z29.b, z2.b[1]\n"
- "add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "sdot z18.s, z30.b, z6.b[2]\n"
- "sdot z22.s, z29.b, z2.b[3]\n"
- "add z17.s, z28.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z7.b[0]\n"
- "sdot z21.s, z29.b, z1.b[1]\n"
- "sdot z20.s, z29.b, z1.b[3]\n"
- "add z28.s, z26.s, z24.s\n"
- "sdot z25.s, z29.b, z5.b[1]\n"
+ "sdot z26.s, z28.b, z5.b[0]\n"
+ "sdot z27.s, z28.b, z5.b[2]\n"
+ "sdot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z28.b, z6.b[2]\n"
+ "sdot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "sdot z25.s, z28.b, z7.b[2]\n"
+ "sdot z30.s, z29.b, z2.b[1]\n"
+ "sdot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z28.b, z0.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z5.b[1]\n"
"sdot z27.s, z29.b, z5.b[3]\n"
- "add z31.s, z22.s, z17.s\n"
- "sdot z19.s, z29.b, z6.b[1]\n"
- "sdot z18.s, z29.b, z6.b[3]\n"
- "add z22.s, z21.s, z28.s\n"
- "sdot z16.s, z29.b, z7.b[1]\n"
- "add z21.s, z20.s, z31.s\n"
- "add z20.s, z25.s, z19.s\n"
- "add z19.s, z27.s, z18.s\n"
- "add z18.s, z16.s, z24.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z7.b[2]\n"
- "sdot z16.s, z29.b, z7.b[3]\n"
- "add z17.s, z16.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z0.b[0]\n"
- "sdot z16.s, z29.b, z0.b[1]\n"
- "add z24.s, z22.s, z16.s\n"
- "add z26.s, z22.s, z25.s\n"
+ "add z30.s, z30.s, z17.s\n"
+ "sdot z20.s, z29.b, z6.b[1]\n"
+ "sdot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "sdot z18.s, z29.b, z7.b[1]\n"
+ "sdot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z28.b, z0.b[2]\n"
+ "sdot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
"mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z0.b[2]\n"
- "sdot z16.s, z29.b, z0.b[3]\n"
- "add z25.s, z21.s, z16.s\n"
- "add z27.s, z21.s, z27.s\n"
"mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
"mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z28.s\n"
+ "add z28.s, z20.s, z30.s\n"
"add z29.s, z19.s, z31.s\n"
"mul z28.s, p2/M, z28.s, z23.s\n"
"mul z29.s, p2/M, z29.s, z23.s\n"
- "add z30.s, z18.s, z20.s\n"
- "add z31.s, z17.s, z19.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
"mul z30.s, p2/M, z30.s, z23.s\n"
"mul z31.s, p2/M, z31.s, z23.s\n"
"zip1 z19.s, z24.s, z26.s\n"
@@ -204,22 +204,22 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"1:" // Loop
"sdot z24.s, z8.b, z0.b[0]\n"
"sdot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"sdot z26.s, z8.b, z1.b[0]\n"
"sdot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"sdot z24.s, z9.b, z0.b[1]\n"
"sdot z25.s, z9.b, z0.b[3]\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
"sdot z26.s, z9.b, z1.b[1]\n"
"sdot z27.s, z9.b, z1.b[3]\n"
"sdot z28.s, z8.b, z2.b[0]\n"
"sdot z29.s, z8.b, z2.b[2]\n"
"sdot z30.s, z8.b, z3.b[0]\n"
"sdot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
"sdot z24.s, z10.b, z1.b[0]\n"
"sdot z25.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
@@ -228,7 +228,7 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"sdot z29.s, z9.b, z2.b[3]\n"
"sdot z30.s, z9.b, z3.b[1]\n"
"sdot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
"sdot z24.s, z11.b, z1.b[1]\n"
"sdot z25.s, z11.b, z1.b[3]\n"
"sdot z26.s, z11.b, z2.b[1]\n"
@@ -237,158 +237,158 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"sdot z29.s, z10.b, z3.b[2]\n"
"sdot z30.s, z10.b, z4.b[0]\n"
"sdot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z25.s, z8.b, z2.b[2]\n"
- "sdot z26.s, z8.b, z3.b[0]\n"
- "sdot z27.s, z8.b, z3.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z17.b, z2.b[0]\n"
+ "sdot z25.s, z17.b, z2.b[2]\n"
+ "sdot z26.s, z17.b, z3.b[0]\n"
+ "sdot z27.s, z17.b, z3.b[2]\n"
"sdot z28.s, z11.b, z3.b[1]\n"
"sdot z29.s, z11.b, z3.b[3]\n"
"sdot z30.s, z11.b, z4.b[1]\n"
"sdot z31.s, z11.b, z4.b[3]\n"
- "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[1]\n"
- "sdot z25.s, z9.b, z2.b[3]\n"
- "sdot z26.s, z9.b, z3.b[1]\n"
- "sdot z27.s, z9.b, z3.b[3]\n"
- "sdot z28.s, z8.b, z4.b[0]\n"
- "sdot z29.s, z8.b, z4.b[2]\n"
- "sdot z30.s, z8.b, z5.b[0]\n"
- "sdot z31.s, z8.b, z5.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "sdot z24.s, z10.b, z3.b[0]\n"
- "sdot z25.s, z10.b, z3.b[2]\n"
- "sdot z26.s, z10.b, z4.b[0]\n"
- "sdot z27.s, z10.b, z4.b[2]\n"
- "sdot z28.s, z9.b, z4.b[1]\n"
- "sdot z29.s, z9.b, z4.b[3]\n"
- "sdot z30.s, z9.b, z5.b[1]\n"
- "sdot z31.s, z9.b, z5.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z16.b, z2.b[1]\n"
+ "sdot z25.s, z16.b, z2.b[3]\n"
+ "sdot z26.s, z16.b, z3.b[1]\n"
+ "sdot z27.s, z16.b, z3.b[3]\n"
+ "sdot z28.s, z17.b, z4.b[0]\n"
+ "sdot z29.s, z17.b, z4.b[2]\n"
+ "sdot z30.s, z17.b, z5.b[0]\n"
+ "sdot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z19.b, z3.b[0]\n"
+ "sdot z25.s, z19.b, z3.b[2]\n"
+ "sdot z26.s, z19.b, z4.b[0]\n"
+ "sdot z27.s, z19.b, z4.b[2]\n"
+ "sdot z28.s, z16.b, z4.b[1]\n"
+ "sdot z29.s, z16.b, z4.b[3]\n"
+ "sdot z30.s, z16.b, z5.b[1]\n"
+ "sdot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z24.s, z11.b, z3.b[1]\n"
- "sdot z25.s, z11.b, z3.b[3]\n"
- "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "sdot z26.s, z11.b, z4.b[1]\n"
- "sdot z27.s, z11.b, z4.b[3]\n"
- "sdot z28.s, z10.b, z5.b[0]\n"
- "sdot z29.s, z10.b, z5.b[2]\n"
- "sdot z30.s, z10.b, z6.b[0]\n"
- "sdot z31.s, z10.b, z6.b[2]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z24.s, z8.b, z4.b[0]\n"
- "sdot z25.s, z8.b, z4.b[2]\n"
- "sdot z26.s, z8.b, z5.b[0]\n"
- "sdot z27.s, z8.b, z5.b[2]\n"
- "sdot z28.s, z11.b, z5.b[1]\n"
- "sdot z29.s, z11.b, z5.b[3]\n"
- "sdot z30.s, z11.b, z6.b[1]\n"
- "sdot z31.s, z11.b, z6.b[3]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sdot z24.s, z9.b, z4.b[1]\n"
- "sdot z25.s, z9.b, z4.b[3]\n"
- ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
- "sdot z26.s, z9.b, z5.b[1]\n"
- "sdot z27.s, z9.b, z5.b[3]\n"
- ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
- "sdot z28.s, z8.b, z6.b[0]\n"
- "sdot z29.s, z8.b, z6.b[2]\n"
- ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
- "sdot z30.s, z8.b, z7.b[0]\n"
- "sdot z31.s, z8.b, z7.b[2]\n"
- ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "sdot z28.s, z9.b, z6.b[1]\n"
- "sdot z29.s, z9.b, z6.b[3]\n"
- "and z16.d, z24.d, z19.d\n"
- "sdot z30.s, z9.b, z7.b[1]\n"
- "sdot z31.s, z9.b, z7.b[3]\n"
- "and z18.d, z25.d, z19.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ "sdot z24.s, z18.b, z3.b[1]\n"
+ "sdot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z18.b, z4.b[1]\n"
+ "sdot z27.s, z18.b, z4.b[3]\n"
+ "sdot z28.s, z19.b, z5.b[0]\n"
+ "sdot z29.s, z19.b, z5.b[2]\n"
+ "sdot z30.s, z19.b, z6.b[0]\n"
+ "sdot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
+ "sdot z28.s, z18.b, z5.b[1]\n"
+ "sdot z29.s, z18.b, z5.b[3]\n"
+ "sdot z30.s, z18.b, z6.b[1]\n"
+ "sdot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z24.s, z16.b, z4.b[1]\n"
+ "sdot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "sdot z26.s, z16.b, z5.b[1]\n"
+ "sdot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z28.s, z17.b, z6.b[0]\n"
+ "sdot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "sdot z30.s, z17.b, z7.b[0]\n"
+ "sdot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z28.s, z16.b, z6.b[1]\n"
+ "sdot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z16.b, z7.b[1]\n"
+ "sdot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
- ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
- ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
- ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
- "and z17.d, z26.d, z19.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z24.s, z24.s, z16.s\n"
- "and z16.d, z27.d, z19.d\n"
- ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
- ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
- "and z16.d, z28.d, z19.d\n"
- "and z18.d, z29.d, z19.d\n"
- "and z17.d, z30.d, z19.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z28.s, z28.s, z16.s\n"
- "and z16.d, z31.d, z19.d\n"
- ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
- ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
"smin z24.s, p2/M, z24.s, z15.s\n"
"smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
"smin z26.s, p2/M, z26.s, z15.s\n"
"smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
"smin z28.s, p2/M, z28.s, z15.s\n"
"smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"smin z30.s, p2/M, z30.s, z15.s\n"
"smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z13.s\n"
- "smax z25.s, p2/M, z25.s, z13.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z13.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z13.s\n"
- "smax z29.s, p2/M, z29.s, z13.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z13.s\n"
- "smax z31.s, p2/M, z31.s, z13.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z12.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z12.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z12.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 0d185fcafc..1730574933 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 6a432e1961..d9c8644fc4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,411 +30,403 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ptrue p1.b\n"
- "mov x24, #0x0\n"
- "ldp x23, x22, [%x[outptrs], #0x0]\n"
- "ldp x21, x20, [%x[outptrs], #0x10]\n"
- "ld1b { z9.b }, p2/Z, [x12, x13]\n"
- "ld1b { z8.b }, p2/Z, [x11, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z7.b }, p2/Z, [x10, x13]\n"
- "zip2 z6.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ld1b { z5.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z7.b, z8.b, z5.b\n"
- "zip2 z5.b, z8.b, z5.b\n"
- "ld1b { z4.b }, p2/Z, [x28, x13]\n"
- "ld1b { z3.b }, p2/Z, [x27, x13]\n"
- "zip2 z8.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z2.b }, p2/Z, [x26, x13]\n"
- "zip1 z7.b, z6.b, z5.b\n"
- "zip2 z5.b, z6.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z0.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "ld1b { z31.b }, p2/Z, [x12, x13]\n"
- "ld1b { z30.b }, p2/Z, [x11, x13]\n"
- "zip1 z2.b, z3.b, z1.b\n"
- "zip2 z1.b, z3.b, z1.b\n"
- "ld1b { z29.b }, p2/Z, [x10, x13]\n"
- "ld1b { z28.b }, p2/Z, [x9, x13]\n"
- "zip2 z27.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "ld1b { z26.b }, p2/Z, [x28, x13]\n"
- "ld1b { z25.b }, p2/Z, [x27, x13]\n"
- "zip1 z29.b, z30.b, z28.b\n"
- "zip2 z28.b, z30.b, z28.b\n"
- "ld1b { z24.b }, p2/Z, [x26, x13]\n"
- "ld1b { z23.b }, p2/Z, [x25, x13]\n"
- "zip2 z22.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z25.b, z23.b\n"
- "zip2 z23.b, z25.b, z23.b\n"
- "ld1w { z6.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z21.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z20.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z19.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip2 z3.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "zip1 z2.b, z0.b, z1.b\n"
- "zip2 z1.b, z0.b, z1.b\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 z30.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z27.b, z28.b\n"
- "zip2 z28.b, z27.b, z28.b\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip2 z25.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ptrue p2.b\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "zip2 z17.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z16.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "ld1b { z13.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x22, x13]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z22.b, z13.b, z16.b\n"
+ "zip1 z13.b, z13.b, z16.b\n"
+ "ld1b { z9.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x13]\n"
+ "zip1 z21.b, z18.b, z10.b\n"
+ "zip2 z10.b, z18.b, z10.b\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x24, x13]\n"
+ "zip2 z20.b, z9.b, z16.b\n"
+ "zip1 z9.b, z9.b, z16.b\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x13]\n"
+ "zip1 z18.b, z17.b, z8.b\n"
+ "zip2 z8.b, z17.b, z8.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip2 z17.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z16.b, z19.b, z6.b\n"
+ "zip2 z6.b, z19.b, z6.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z1.b, z13.b, z21.b\n"
+ "zip1 z13.b, z13.b, z21.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z0.b, z22.b, z10.b\n"
+ "zip2 z10.b, z22.b, z10.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z20.b, z8.b\n"
+ "zip2 z8.b, z20.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
- "zip1 z24.b, z22.b, z23.b\n"
- "zip2 z23.b, z22.b, z23.b\n"
- "mov z0.d, z6.d\n"
- "mov z27.d, z6.d\n"
- "mov z22.d, z6.d\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
"1:" // Loop
- "sdot z6.s, z18.b, z9.b\n"
- "sdot z27.s, z18.b, z4.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z17.b, z4.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "sdot z0.s, z18.b, z9.b\n"
- "ld1w { z9.s }, p1/Z, [%x[params]]\n"
- "sdot z22.s, z18.b, z4.b\n"
- "sdot z27.s, z17.b, z31.b\n"
+ "sdot z5.s, z29.b, z15.b\n"
+ "sdot z22.s, z29.b, z13.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z5.s, z28.b, z13.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "sdot z24.s, z29.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [%x[params]]\n"
+ "sdot z21.s, z29.b, z13.b\n"
+ "sdot z22.s, z28.b, z9.b\n"
"incw x13, ALL, MUL #4\n"
- "sdot z6.s, z16.b, z31.b\n"
- "ext z31.b, z31.b, z31.b, #0x1\n"
- "sdot z0.s, z17.b, z4.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "sdot z22.s, z17.b, z31.b\n"
- "sdot z27.s, z16.b, z26.b\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z31.b\n"
- "sdot z22.s, z16.b, z26.b\n"
- "and z18.d, z6.d, z4.d\n"
+ "sdot z5.s, z26.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "sdot z24.s, z28.b, z13.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z21.s, z28.b, z9.b\n"
+ "sdot z22.s, z26.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ "sdot z24.s, z26.b, z9.b\n"
+ "sdot z21.s, z26.b, z7.b\n"
+ "and z16.d, z5.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z3.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z8.b\n"
- "sdot z6.s, z17.b, z3.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z0.s, z18.b, z8.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "sdot z22.s, z18.b, z3.b\n"
- "sdot z27.s, z17.b, z30.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z0.s, z17.b, z3.b\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z5.s, z5.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z5.s, p2/M, z5.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z5.s, p2/M, z5.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z5.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z12.b\n"
+ "sdot z23.s, z17.b, z1.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z24.s, z18.b, z12.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z21.s, z18.b, z1.b\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z24.s, z17.b, z1.b\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z22.s, z17.b, z30.b\n"
- "sdot z27.s, z16.b, z25.b\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z30.b\n"
- "sdot z22.s, z16.b, z25.b\n"
- "and z18.d, z6.d, z4.d\n"
+ "sdot z21.s, z17.b, z31.b\n"
+ "sdot z22.s, z16.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z31.b\n"
+ "sdot z21.s, z16.b, z27.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z0.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z11.b\n"
+ "sdot z23.s, z17.b, z0.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z24.s, z18.b, z11.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z21.s, z18.b, z0.b\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z24.s, z17.b, z0.b\n"
+ "sdot z21.s, z17.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z30.b\n"
+ "sdot z21.s, z16.b, z25.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z2.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z7.b\n"
- "sdot z6.s, z17.b, z2.b\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z0.s, z18.b, z7.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z22.s, z18.b, z2.b\n"
- "sdot z27.s, z17.b, z29.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "sdot z0.s, z17.b, z2.b\n"
- "sdot z22.s, z17.b, z29.b\n"
- "sdot z27.s, z16.b, z24.b\n"
- "ext z24.b, z24.b, z24.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z29.b\n"
- "sdot z22.s, z16.b, z24.b\n"
- "and z18.d, z6.d, z4.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params]]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z1.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z5.b\n"
- "sdot z6.s, z17.b, z1.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z0.s, z18.b, z5.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "sdot z22.s, z18.b, z1.b\n"
- "sdot z27.s, z17.b, z28.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "sdot z0.s, z17.b, z1.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "sdot z22.s, z17.b, z28.b\n"
- "sdot z27.s, z16.b, z23.b\n"
- "ext z23.b, z23.b, z23.b, #0x1\n"
- "ld1b { z8.b }, p2/Z, [x11, x13]\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z28.b\n"
- "sdot z22.s, z16.b, z23.b\n"
- "ld1b { z7.b }, p2/Z, [x10, x13]\n"
- "and z18.d, z6.d, z4.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x9, x13]\n"
- "ld1b { z3.b }, p2/Z, [x27, x13]\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- "ld1b { z2.b }, p2/Z, [x26, x13]\n"
- "ld1b { z1.b }, p2/Z, [x25, x13]\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1b { z9.b }, p2/Z, [x12, x13]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z18.d, z22.d, z4.d\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z29.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z28.s, z18.b, z10.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z23.s, z18.b, z14.b\n"
+ "sdot z23.s, z17.b, z10.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z29.s, z18.b, z14.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z18.b, z10.b\n"
+ "sdot z28.s, z17.b, z8.b\n"
+ "incw x12\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "sdot z29.s, z17.b, z10.b\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "sdot z27.s, z17.b, z8.b\n"
+ "sdot z28.s, z16.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "ld1b { z26.b }, p0/Z, [x26, x13]\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z29.s, z16.b, z8.b\n"
+ "sdot z27.s, z16.b, z6.b\n"
+ "ld1b { z21.b }, p0/Z, [x25, x13]\n"
+ "and z16.d, z23.d, z22.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z14.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x13]\n"
+ ".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ "ld1b { z20.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ ".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "and z19.d, z29.d, z22.d\n"
+ "and z17.d, z28.d, z22.d\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z16.d, z27.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "ld1b { z9.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x13]\n"
"asr z17.s, z17.s, #0x1f\n"
- "ld1b { z31.b }, p2/Z, [x12, x13]\n"
- "ld1b { z30.b }, p2/Z, [x11, x13]\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "ld1b { z29.b }, p2/Z, [x10, x13]\n"
- "ld1b { z28.b }, p2/Z, [x9, x13]\n"
- "sqadd z0.s, z0.s, z17.s\n"
+ "ld1b { z18.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x20, x13]\n"
+ "sqadd z29.s, z29.s, z19.s\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "ld1b { z4.b }, p2/Z, [x28, x13]\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "add z22.s, z22.s, z19.s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1b { z26.b }, p2/Z, [x28, x13]\n"
- "ld1b { z25.b }, p2/Z, [x27, x13]\n"
- "ld1b { z24.b }, p2/Z, [x26, x13]\n"
- "zip2 z6.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ld1b { z23.b }, p2/Z, [x25, x13]\n"
- "zip1 z7.b, z8.b, z5.b\n"
- "zip2 z5.b, z8.b, z5.b\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "zip2 z8.b, z9.b, z7.b\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "zip1 z7.b, z6.b, z5.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "zip2 z5.b, z6.b, z5.b\n"
- "zip2 z0.b, z4.b, z2.b\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "zip1 z2.b, z3.b, z1.b\n"
- "incw x24\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "zip2 z1.b, z3.b, z1.b\n"
- "zip2 z27.b, z31.b, z29.b\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z30.b, z28.b\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #6, MUL VL]\n"
- "zip2 z28.b, z30.b, z28.b\n"
- "zip2 z22.b, z26.b, z24.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z29.s, p2/M, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z4.s\n"
+ "st1b { z23.s }, p1, [x11, x12]\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x13]\n"
+ "zip2 z17.b, z15.b, z21.b\n"
+ "zip1 z15.b, z15.b, z21.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip1 z16.b, z26.b, z14.b\n"
+ "zip2 z14.b, z26.b, z14.b\n"
+ "smin z29.s, p2/M, z29.s, z3.s\n"
+ "smin z28.s, p2/M, z28.s, z3.s\n"
+ "smin z27.s, p2/M, z27.s, z3.s\n"
+ "st1b { z29.s }, p1, [x10, x12]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "st1b { z28.s }, p1, [x9, x12]\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z27.s }, p1, [x28, x12]\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "zip2 z21.b, z13.b, z20.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z20.b, z25.b, z10.b\n"
+ "incw x12\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip2 z10.b, z25.b, z10.b\n"
+ "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z18.b, z24.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "zip2 z17.b, z7.b, z22.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z25.b, z23.b\n"
- "zip2 z23.b, z25.b, z23.b\n"
- "zip2 z3.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "zip1 z2.b, z0.b, z1.b\n"
- "zip2 z1.b, z0.b, z1.b\n"
- "zip2 z30.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z27.b, z28.b\n"
- "zip2 z28.b, z27.b, z28.b\n"
- "zip2 z25.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z22.b, z23.b\n"
- "zip2 z23.b, z22.b, z23.b\n"
- "mov z0.d, z6.d\n"
- "mov z27.d, z6.d\n"
- "mov z22.d, z6.d\n"
+ "zip1 z7.b, z7.b, z22.b\n"
+ "zip1 z16.b, z23.b, z6.b\n"
+ "zip2 z6.b, z23.b, z6.b\n"
+ "zip2 z1.b, z13.b, z20.b\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z0.b, z21.b, z10.b\n"
+ "zip2 z10.b, z21.b, z10.b\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z19.b, z8.b\n"
+ "zip2 z8.b, z19.b, z8.b\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 648b2da163..9432cd7550 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 257c4d44dc..f0860c98b9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,472 +30,464 @@
namespace arm_conv {
namespace depthwise {
-void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const uint8_t *const *const inptrs,
- const uint8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- uint8_t *const *const outptrs
-)
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
"mov x20, #0x1\n"
- "ptrue p1.b\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
"orr x20, x20, #0x100\n"
"orr x20, x20, #0x10000\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "dup z12.s, w20\n"
- "mov x20, #0x0\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z4.d, z10.d\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"1:" // Loop
- "mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z8.b\n"
- "udot z10.s, z21.b, z14.b\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z19.s, z12.b, z3.b\n"
- "udot z31.s, z21.b, z8.b\n"
- "incw x13, ALL, MUL #4\n"
- "udot z10.s, z16.b, z8.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z30.b\n"
- "udot z19.s, z12.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "udot z31.s, z16.b, z3.b\n"
- "udot z10.s, z20.b, z3.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "udot z4.s, z21.b, z14.b\n"
- "udot z26.s, z21.b, z8.b\n"
- "mov z17.s, #0x0\n"
- "udot z17.s, z12.b, z8.b\n"
- "udot z17.s, z12.b, z3.b\n"
- "udot z31.s, z20.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "udot z4.s, z16.b, z8.b\n"
- "udot z26.s, z16.b, z3.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z30.b\n"
- "mov z19.s, #0x0\n"
- "udot z17.s, z12.b, z14.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params]]\n"
- "udot z4.s, z20.b, z3.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "udot z26.s, z20.b, z30.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- "udot z19.s, z12.b, z7.b\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "sqadd z10.s, z10.s, z21.s\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "udot z19.s, z12.b, z2.b\n"
- "and z16.d, z4.d, z8.d\n"
- "and z20.d, z31.d, z8.d\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z29.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "udot z19.s, z12.b, z13.b\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- "add z10.s, z10.s, z22.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "add z31.s, z31.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z7.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z13.b\n"
- "udot z10.s, z16.b, z7.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "udot z4.s, z21.b, z13.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z7.b\n"
- "udot z17.s, z12.b, z7.b\n"
- "incw x20\n"
- "udot z31.s, z16.b, z2.b\n"
- "udot z10.s, z20.b, z2.b\n"
+ "mov z30.s, #0x0\n"
+ "udot z30.s, z25.b, z9.b\n"
+ "udot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z30.s, z25.b, z18.b\n"
+ "udot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "udot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
+ "udot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "udot z27.s, z25.b, z9.b\n"
+ "udot z31.s, z3.b, z18.b\n"
+ "udot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "udot z22.s, z26.b, z15.b\n"
+ "udot z21.s, z26.b, z9.b\n"
+ "udot z27.s, z25.b, z18.b\n"
+ "udot z31.s, z1.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z4.s, z16.b, z7.b\n"
- "udot z26.s, z16.b, z2.b\n"
+ "udot z22.s, z3.b, z9.b\n"
+ "udot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "udot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "udot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "udot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "udot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "udot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "udot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "udot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "udot z28.s, z1.b, z13.b\n"
+ "udot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "udot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "udot z22.s, z1.b, z19.b\n"
+ "udot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "udot z31.s, z15.b, z12.b\n"
+ "udot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z26.s, z15.b, z19.b\n"
+ "udot z22.s, z15.b, z12.b\n"
"addvl %x[params], %x[params], #16\n"
- "udot z17.s, z12.b, z2.b\n"
- "udot z31.s, z20.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "udot z4.s, z20.b, z2.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "udot z26.s, z20.b, z29.b\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z29.b\n"
- "and z21.d, z10.d, z8.d\n"
- "udot z17.s, z12.b, z13.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z6.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "udot z19.s, z12.b, z1.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z6.b\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z28.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z11.b\n"
- "udot z10.s, z16.b, z6.b\n"
- "udot z19.s, z12.b, z11.b\n"
+ "udot z18.s, z25.b, z12.b\n"
+ "udot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "udot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "udot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "udot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "udot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "udot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "udot z23.s, z19.b, z14.b\n"
+ "udot z23.s, z30.b, z11.b\n"
+ "udot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z28.s, z19.b, z14.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "udot z4.s, z21.b, z11.b\n"
- "ext z6.b, z6.b, z6.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z6.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "udot z17.s, z12.b, z6.b\n"
- "udot z31.s, z16.b, z1.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z10.s, z20.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "udot z4.s, z16.b, z6.b\n"
- "udot z26.s, z16.b, z1.b\n"
- "udot z17.s, z12.b, z1.b\n"
- "udot z31.s, z20.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "udot z4.s, z20.b, z1.b\n"
- "udot z26.s, z20.b, z28.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z28.b\n"
- "udot z17.s, z12.b, z11.b\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "mov z12.s, #0x0\n"
+ "udot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "udot z12.s, z25.b, z11.b\n"
+ "udot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "udot z28.s, z30.b, z11.b\n"
+ "udot z26.s, z30.b, z20.b\n"
+ "udot z12.s, z25.b, z20.b\n"
+ "udot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "udot z28.s, z21.b, z20.b\n"
+ "udot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
+ "udot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params]]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "udot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "udot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "udot z1.s, z21.b, z29.b\n"
+ "udot z1.s, z13.b, z17.b\n"
+ "udot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
"mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z5.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "udot z19.s, z12.b, z0.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z5.b\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z27.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z9.b\n"
- "udot z10.s, z16.b, z5.b\n"
- "udot z19.s, z12.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "udot z4.s, z21.b, z9.b\n"
+ "udot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z19.s, z25.b, z17.b\n"
+ "udot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "udot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "udot z30.s, z13.b, z4.b\n"
+ "udot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "udot z31.s, z20.b, z5.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z5.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "udot z17.s, z12.b, z5.b\n"
- "udot z31.s, z16.b, z0.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z10.s, z20.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "udot z4.s, z16.b, z5.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "udot z26.s, z16.b, z0.b\n"
- "udot z17.s, z12.b, z0.b\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "udot z31.s, z20.b, z27.b\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "udot z4.s, z20.b, z0.b\n"
- "udot z26.s, z20.b, z27.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z27.b\n"
- "udot z17.s, z12.b, z9.b\n"
- "and z21.d, z10.d, z8.d\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "udot z0.s, z20.b, z4.b\n"
+ "udot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
+ "udot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
"asr z21.s, z21.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "asr z16.s, z16.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
"sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "incw x20\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
- "mov z4.d, z10.d\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1cf20ef721..0300b71d7c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 386eb96cff..5c26010c0d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,320 +91,320 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z0.h }, p4/Z, [x16]\n"
- "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
- "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
- "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1b { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1b { z0.h }, p4/Z, [x16]\n"
- "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
- "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
- "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index a794095c6f..bcd0d60d3c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 9f21401840..1ea2fcbfbd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z0.h }, p4/Z, [x17]\n"
- "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1b { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
"ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1b { z0.h }, p4/Z, [x17]\n"
- "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1b { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index ac0a00b245..dfaa059e9f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 40e2f5df25..b8adbb8262 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1b { z0.h }, p4/Z, [x2]\n"
- "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1b { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1b { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1b { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1b { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1b { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1b { z0.h }, p4/Z, [x2]\n"
- "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 81c954a11b..d5382533a8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 66c24c34b5..a9cd8a7fa9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -47,285 +47,285 @@ void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr x20, [%x[inptrs], #0x10]\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z15.b, #0x1\n"
- "lsr z15.s, z15.s, #0x8\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
"ld1b { z1.b }, p0/Z, [x23]\n"
"ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z30.d, z1.d\n"
- "mov z29.d, z1.d\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z28.d, z1.d\n"
- "mov z27.d, z2.d\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
"ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
"mov z26.d, z2.d\n"
- "mov z25.d, z2.d\n"
"ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z24.d, z4.d\n"
- "mov z23.d, z4.d\n"
- "ptrue p2.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z22.d, z4.d\n"
- "ext z30.b, z30.b, z30.b, #0x2\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z14.s, p2/M, z14.s\n"
- "ext z29.b, z29.b, z29.b, #0x4\n"
- "ext z28.b, z28.b, z28.b, #0x6\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
- "ext z27.b, z27.b, z27.b, #0x2\n"
- "ext z26.b, z26.b, z26.b, #0x4\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z25.b, z25.b, z25.b, #0x6\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z23.b, z23.b, z23.b, #0x4\n"
- "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"mov z21.d, z0.d\n"
"mov z20.d, z0.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"mov z19.d, z0.d\n"
- "mov z18.d, z3.d\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
"mov z16.d, z3.d\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"ext z21.b, z21.b, z21.b, #0x2\n"
"ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
"ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z29.s\n"
- "zip1 z30.s, z30.s, z28.s\n"
- "zip1 z2.s, z2.s, z26.s\n"
- "zip1 z27.s, z27.s, z25.s\n"
- "ext z18.b, z18.b, z18.b, #0x2\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
"ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z23.s\n"
- "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
"zip1 z0.s, z0.s, z20.s\n"
"zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z30.s\n"
- "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
"zip1 z3.s, z3.s, z17.s\n"
- "zip1 z18.s, z18.s, z16.s\n"
- "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
"zip1 z0.s, z0.s, z21.s\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z18.s\n"
+ "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
"mov z24.s, #0x0\n"
"mov z25.s, #0x0\n"
- "udot z24.s, z15.b, z1.b[0]\n"
+ "udot z24.s, z13.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
- "udot z25.s, z15.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "udot z23.s, z15.b, z1.b[2]\n"
- "mov z9.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "udot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
+ "udot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
"mov z18.s, #0x0\n"
- "udot z21.s, z15.b, z2.b[0]\n"
+ "udot z21.s, z13.b, z2.b[0]\n"
"mov z17.s, #0x0\n"
"mov z16.s, #0x0\n"
- "udot z20.s, z15.b, z2.b[1]\n"
- "udot z9.s, z15.b, z2.b[2]\n"
- "udot z8.s, z15.b, z2.b[3]\n"
+ "udot z19.s, z13.b, z2.b[1]\n"
+ "udot z10.s, z13.b, z2.b[2]\n"
+ "udot z8.s, z13.b, z2.b[3]\n"
"mov z0.q, z0.q[0]\n"
- "udot z19.s, z15.b, z4.b[0]\n"
- "udot z18.s, z15.b, z4.b[1]\n"
+ "udot z20.s, z13.b, z4.b[0]\n"
+ "udot z18.s, z13.b, z4.b[1]\n"
"mov z3.q, z3.q[0]\n"
- "udot z17.s, z15.b, z4.b[2]\n"
- "udot z16.s, z15.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[2]\n"
+ "udot z16.s, z13.b, z4.b[3]\n"
"mov z31.s, #0x0\n"
"mov z30.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "udot z31.s, z15.b, z0.b[0]\n"
+ "mov z26.s, #0x0\n"
+ "udot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z28.s, #0x0\n"
- "udot z30.s, z15.b, z0.b[1]\n"
- "udot z29.s, z15.b, z0.b[2]\n"
- "udot z28.s, z15.b, z0.b[3]\n"
+ "udot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "udot z26.s, z13.b, z0.b[2]\n"
+ "udot z27.s, z13.b, z0.b[3]\n"
+ "udot z28.s, z13.b, z3.b[0]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
"add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z20.s\n"
- "add z26.s, z23.s, z9.s\n"
- "add z27.s, z22.s, z8.s\n"
- "add z23.s, z19.s, z21.s\n"
- "mov z22.s, #0x0\n"
- "udot z22.s, z15.b, z3.b[0]\n"
- "add z21.s, z18.s, z20.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
"mov z20.s, #0x0\n"
- "udot z20.s, z15.b, z3.b[1]\n"
- "add z19.s, z17.s, z9.s\n"
+ "udot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
"mov z18.s, #0x0\n"
- "udot z18.s, z15.b, z3.b[2]\n"
- "add z17.s, z16.s, z8.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z15.b, z3.b[3]\n"
+ "udot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
"add z24.s, z24.s, z31.s\n"
"add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z14.s\n"
- "mul z25.s, p2/M, z25.s, z14.s\n"
- "add z26.s, z26.s, z29.s\n"
- "add z27.s, z27.s, z28.s\n"
- "mul z26.s, p2/M, z26.s, z14.s\n"
- "mul z27.s, p2/M, z27.s, z14.s\n"
- "add z28.s, z23.s, z22.s\n"
- "add z29.s, z21.s, z20.s\n"
- "mul z28.s, p2/M, z28.s, z14.s\n"
- "mul z29.s, p2/M, z29.s, z14.s\n"
- "add z30.s, z19.s, z18.s\n"
- "add z31.s, z17.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z14.s\n"
- "mul z31.s, p2/M, z31.s, z14.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
"zip1 z19.s, z24.s, z26.s\n"
"zip1 z18.s, z25.s, z27.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
"zip1 z22.s, z19.s, z18.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"1:" // Loop
"udot z24.s, z5.b, z0.b[0]\n"
"udot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params]]\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"udot z26.s, z5.b, z0.b[2]\n"
"udot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"udot z24.s, z6.b, z1.b[0]\n"
"udot z25.s, z6.b, z1.b[1]\n"
- "whilelt p1.b, x9, x10\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"udot z26.s, z6.b, z1.b[2]\n"
"udot z27.s, z6.b, z1.b[3]\n"
"udot z28.s, z5.b, z2.b[0]\n"
"udot z29.s, z5.b, z2.b[1]\n"
"udot z30.s, z5.b, z2.b[2]\n"
"udot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"udot z24.s, z7.b, z2.b[0]\n"
"udot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
"udot z26.s, z7.b, z2.b[2]\n"
"udot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"udot z28.s, z6.b, z3.b[0]\n"
"udot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"udot z30.s, z6.b, z3.b[2]\n"
"udot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"udot z28.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z20.d\n"
+ "and z19.d, z24.d, z21.d\n"
"udot z30.s, z7.b, z4.b[2]\n"
"udot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z20.d\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "and z17.d, z26.d, z20.d\n"
- "and z16.d, z27.d, z20.d\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #6\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
- ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
- ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
- ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
"sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
- ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
- "and z19.d, z28.d, z20.d\n"
- "and z18.d, z29.d, z20.d\n"
- "and z17.d, z30.d, z20.d\n"
- "and z16.d, z31.d, z20.d\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
- ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
- ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "smin z25.s, p2/M, z25.s, z10.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z10.s\n"
- "smin z27.s, p2/M, z27.s, z10.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z28.s, p2/M, z28.s, z10.s\n"
- "smin z29.s, p2/M, z29.s, z10.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
- "smin z30.s, p2/M, z30.s, z10.s\n"
- "smin z31.s, p2/M, z31.s, z10.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z25.s, p2/M, z25.s, z11.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z11.s\n"
- "smax z27.s, p2/M, z27.s, z11.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z11.s\n"
- "smax z29.s, p2/M, z29.s, z11.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z11.s\n"
- "smax z31.s, p2/M, z31.s, z11.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z13.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z13.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index e7173de65a..55b6edea2c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index debaa8c296..4b65a67309 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -47,8 +47,8 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ldr x21, [%x[inptrs], #0x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z20.d, z3.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
"mov z18.d, z4.d\n"
@@ -59,132 +59,132 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z3.d, z3.d, z23.d\n"
"zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z20.d, z1.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z13.d, z5.d\n"
- "mov z19.d, z6.d\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z25.d, z7.d\n"
+ "mov z8.d, z7.d\n"
"zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
"mov z4.q, z4.q[0]\n"
"ptrue p2.b\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
"neg z23.s, p2/M, z23.s\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- "mov z30.b, #0x1\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
"mov z24.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "udot z24.s, z30.b, z3.b[0]\n"
- "ld1w { z12.s }, p1/Z, [%x[params]]\n"
- "mov z18.s, #0x0\n"
+ "udot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
"mov z17.s, #0x0\n"
- "udot z28.s, z30.b, z3.b[2]\n"
+ "mov z16.s, #0x0\n"
+ "udot z24.s, z28.b, z3.b[2]\n"
"mov x28, #0x0\n"
- "mov z16.d, z0.d\n"
- "udot z18.s, z30.b, z4.b[0]\n"
- "udot z17.s, z30.b, z4.b[2]\n"
+ "mov z27.d, z0.d\n"
+ "udot z17.s, z28.b, z4.b[0]\n"
+ "udot z16.s, z28.b, z4.b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "zip1 z1.d, z1.d, z20.d\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z13.d\n"
+ "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z19.d\n"
- "zip1 z7.d, z7.d, z25.d\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z26.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "udot z26.s, z30.b, z2.b[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "udot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z29.s, #0x1\n"
- "udot z22.s, z30.b, z2.b[2]\n"
- "udot z24.s, z29.b, z3.b[1]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z16.d\n"
+ "udot z31.s, z28.b, z2.b[2]\n"
+ "udot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
"mov z1.q, z1.q[0]\n"
- "udot z28.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "udot z18.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "udot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "mov z21.s, #0x0\n"
- "udot z17.s, z29.b, z4.b[3]\n"
+ "mov z22.s, #0x0\n"
+ "udot z16.s, z29.b, z4.b[3]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z20.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "udot z21.s, z30.b, z1.b[0]\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z22.s, z28.b, z1.b[0]\n"
"mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
"mov z19.s, #0x0\n"
- "udot z20.s, z30.b, z1.b[2]\n"
- "udot z25.s, z30.b, z5.b[0]\n"
- "udot z27.s, z30.b, z5.b[2]\n"
- "mov z0.q, z0.q[0]\n"
- "udot z19.s, z30.b, z6.b[0]\n"
- "udot z26.s, z29.b, z2.b[1]\n"
- "add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "udot z18.s, z30.b, z6.b[2]\n"
- "udot z22.s, z29.b, z2.b[3]\n"
- "add z17.s, z28.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z7.b[0]\n"
- "udot z21.s, z29.b, z1.b[1]\n"
- "udot z20.s, z29.b, z1.b[3]\n"
- "add z28.s, z26.s, z24.s\n"
- "udot z25.s, z29.b, z5.b[1]\n"
+ "udot z26.s, z28.b, z5.b[0]\n"
+ "udot z27.s, z28.b, z5.b[2]\n"
+ "udot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z28.b, z6.b[2]\n"
+ "udot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "udot z25.s, z28.b, z7.b[2]\n"
+ "udot z30.s, z29.b, z2.b[1]\n"
+ "udot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z28.b, z0.b[0]\n"
+ "udot z21.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z5.b[1]\n"
"udot z27.s, z29.b, z5.b[3]\n"
- "add z31.s, z22.s, z17.s\n"
- "udot z19.s, z29.b, z6.b[1]\n"
- "udot z18.s, z29.b, z6.b[3]\n"
- "add z22.s, z21.s, z28.s\n"
- "udot z16.s, z29.b, z7.b[1]\n"
- "add z21.s, z20.s, z31.s\n"
- "add z20.s, z25.s, z19.s\n"
- "add z19.s, z27.s, z18.s\n"
- "add z18.s, z16.s, z24.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z7.b[2]\n"
- "udot z16.s, z29.b, z7.b[3]\n"
- "add z17.s, z16.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z0.b[0]\n"
- "udot z16.s, z29.b, z0.b[1]\n"
- "add z24.s, z22.s, z16.s\n"
- "add z26.s, z22.s, z25.s\n"
+ "add z30.s, z30.s, z17.s\n"
+ "udot z20.s, z29.b, z6.b[1]\n"
+ "udot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "udot z18.s, z29.b, z7.b[1]\n"
+ "udot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z28.b, z0.b[2]\n"
+ "udot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
"mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z0.b[2]\n"
- "udot z16.s, z29.b, z0.b[3]\n"
- "add z25.s, z21.s, z16.s\n"
- "add z27.s, z21.s, z27.s\n"
"mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
"mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z28.s\n"
+ "add z28.s, z20.s, z30.s\n"
"add z29.s, z19.s, z31.s\n"
"mul z28.s, p2/M, z28.s, z23.s\n"
"mul z29.s, p2/M, z29.s, z23.s\n"
- "add z30.s, z18.s, z20.s\n"
- "add z31.s, z17.s, z19.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
"mul z30.s, p2/M, z30.s, z23.s\n"
"mul z31.s, p2/M, z31.s, z23.s\n"
"zip1 z19.s, z24.s, z26.s\n"
@@ -204,22 +204,22 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"1:" // Loop
"udot z24.s, z8.b, z0.b[0]\n"
"udot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"udot z26.s, z8.b, z1.b[0]\n"
"udot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"udot z24.s, z9.b, z0.b[1]\n"
"udot z25.s, z9.b, z0.b[3]\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
"udot z26.s, z9.b, z1.b[1]\n"
"udot z27.s, z9.b, z1.b[3]\n"
"udot z28.s, z8.b, z2.b[0]\n"
"udot z29.s, z8.b, z2.b[2]\n"
"udot z30.s, z8.b, z3.b[0]\n"
"udot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
"udot z24.s, z10.b, z1.b[0]\n"
"udot z25.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[0]\n"
@@ -228,7 +228,7 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"udot z29.s, z9.b, z2.b[3]\n"
"udot z30.s, z9.b, z3.b[1]\n"
"udot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
"udot z24.s, z11.b, z1.b[1]\n"
"udot z25.s, z11.b, z1.b[3]\n"
"udot z26.s, z11.b, z2.b[1]\n"
@@ -237,158 +237,158 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"udot z29.s, z10.b, z3.b[2]\n"
"udot z30.s, z10.b, z4.b[0]\n"
"udot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z25.s, z8.b, z2.b[2]\n"
- "udot z26.s, z8.b, z3.b[0]\n"
- "udot z27.s, z8.b, z3.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z17.b, z2.b[0]\n"
+ "udot z25.s, z17.b, z2.b[2]\n"
+ "udot z26.s, z17.b, z3.b[0]\n"
+ "udot z27.s, z17.b, z3.b[2]\n"
"udot z28.s, z11.b, z3.b[1]\n"
"udot z29.s, z11.b, z3.b[3]\n"
"udot z30.s, z11.b, z4.b[1]\n"
"udot z31.s, z11.b, z4.b[3]\n"
- "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[1]\n"
- "udot z25.s, z9.b, z2.b[3]\n"
- "udot z26.s, z9.b, z3.b[1]\n"
- "udot z27.s, z9.b, z3.b[3]\n"
- "udot z28.s, z8.b, z4.b[0]\n"
- "udot z29.s, z8.b, z4.b[2]\n"
- "udot z30.s, z8.b, z5.b[0]\n"
- "udot z31.s, z8.b, z5.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "udot z24.s, z10.b, z3.b[0]\n"
- "udot z25.s, z10.b, z3.b[2]\n"
- "udot z26.s, z10.b, z4.b[0]\n"
- "udot z27.s, z10.b, z4.b[2]\n"
- "udot z28.s, z9.b, z4.b[1]\n"
- "udot z29.s, z9.b, z4.b[3]\n"
- "udot z30.s, z9.b, z5.b[1]\n"
- "udot z31.s, z9.b, z5.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z16.b, z2.b[1]\n"
+ "udot z25.s, z16.b, z2.b[3]\n"
+ "udot z26.s, z16.b, z3.b[1]\n"
+ "udot z27.s, z16.b, z3.b[3]\n"
+ "udot z28.s, z17.b, z4.b[0]\n"
+ "udot z29.s, z17.b, z4.b[2]\n"
+ "udot z30.s, z17.b, z5.b[0]\n"
+ "udot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z19.b, z3.b[0]\n"
+ "udot z25.s, z19.b, z3.b[2]\n"
+ "udot z26.s, z19.b, z4.b[0]\n"
+ "udot z27.s, z19.b, z4.b[2]\n"
+ "udot z28.s, z16.b, z4.b[1]\n"
+ "udot z29.s, z16.b, z4.b[3]\n"
+ "udot z30.s, z16.b, z5.b[1]\n"
+ "udot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "udot z24.s, z11.b, z3.b[1]\n"
- "udot z25.s, z11.b, z3.b[3]\n"
- "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "udot z26.s, z11.b, z4.b[1]\n"
- "udot z27.s, z11.b, z4.b[3]\n"
- "udot z28.s, z10.b, z5.b[0]\n"
- "udot z29.s, z10.b, z5.b[2]\n"
- "udot z30.s, z10.b, z6.b[0]\n"
- "udot z31.s, z10.b, z6.b[2]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "udot z24.s, z8.b, z4.b[0]\n"
- "udot z25.s, z8.b, z4.b[2]\n"
- "udot z26.s, z8.b, z5.b[0]\n"
- "udot z27.s, z8.b, z5.b[2]\n"
- "udot z28.s, z11.b, z5.b[1]\n"
- "udot z29.s, z11.b, z5.b[3]\n"
- "udot z30.s, z11.b, z6.b[1]\n"
- "udot z31.s, z11.b, z6.b[3]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "udot z24.s, z9.b, z4.b[1]\n"
- "udot z25.s, z9.b, z4.b[3]\n"
- ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
- "udot z26.s, z9.b, z5.b[1]\n"
- "udot z27.s, z9.b, z5.b[3]\n"
- ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
- "udot z28.s, z8.b, z6.b[0]\n"
- "udot z29.s, z8.b, z6.b[2]\n"
- ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
- "udot z30.s, z8.b, z7.b[0]\n"
- "udot z31.s, z8.b, z7.b[2]\n"
- ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "udot z28.s, z9.b, z6.b[1]\n"
- "udot z29.s, z9.b, z6.b[3]\n"
- "and z16.d, z24.d, z19.d\n"
- "udot z30.s, z9.b, z7.b[1]\n"
- "udot z31.s, z9.b, z7.b[3]\n"
- "and z18.d, z25.d, z19.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ "udot z24.s, z18.b, z3.b[1]\n"
+ "udot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z18.b, z4.b[1]\n"
+ "udot z27.s, z18.b, z4.b[3]\n"
+ "udot z28.s, z19.b, z5.b[0]\n"
+ "udot z29.s, z19.b, z5.b[2]\n"
+ "udot z30.s, z19.b, z6.b[0]\n"
+ "udot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
+ "udot z28.s, z18.b, z5.b[1]\n"
+ "udot z29.s, z18.b, z5.b[3]\n"
+ "udot z30.s, z18.b, z6.b[1]\n"
+ "udot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z24.s, z16.b, z4.b[1]\n"
+ "udot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "udot z26.s, z16.b, z5.b[1]\n"
+ "udot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z28.s, z17.b, z6.b[0]\n"
+ "udot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "udot z30.s, z17.b, z7.b[0]\n"
+ "udot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z28.s, z16.b, z6.b[1]\n"
+ "udot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z16.b, z7.b[1]\n"
+ "udot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
- ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
- ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
- ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
- "and z17.d, z26.d, z19.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z24.s, z24.s, z16.s\n"
- "and z16.d, z27.d, z19.d\n"
- ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
- ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
- "and z16.d, z28.d, z19.d\n"
- "and z18.d, z29.d, z19.d\n"
- "and z17.d, z30.d, z19.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z28.s, z28.s, z16.s\n"
- "and z16.d, z31.d, z19.d\n"
- ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
- ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
"smin z24.s, p2/M, z24.s, z15.s\n"
"smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
"smin z26.s, p2/M, z26.s, z15.s\n"
"smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
"smin z28.s, p2/M, z28.s, z15.s\n"
"smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"smin z30.s, p2/M, z30.s, z15.s\n"
"smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z13.s\n"
- "smax z25.s, p2/M, z25.s, z13.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z13.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z13.s\n"
- "smax z29.s, p2/M, z29.s, z13.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z13.s\n"
- "smax z31.s, p2/M, z31.s, z13.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z12.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z12.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z12.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 3d475daf72..0f1030c0d7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index ff3ec0ba48..887eccf1e9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,320 +91,320 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1b { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 9a3db20f73..79e3fd5f54 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 24c4bf713d..754d06d443 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
"ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 06ca42eed9..0ff853ec2d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 9c291ae186..f24a258484 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1b { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1b { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1b { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
index e9b29ca877..b1fe66cea2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,7 +91,7 @@
#include "depthwise.hpp"
#include "depthfirst_driver.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
index 8473fc0838..b0aa62bbcb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#pragma once
#include "pooling.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace pooling {
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index a670bb81bb..6b3ebe6664 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -48,4 +48,4 @@ struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 647103d3a4..5df848d1dd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -196,38 +196,38 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr h6, [x11, x4]\n"
- "ldr h5, [x10, x4]\n"
- "fadd v17.8h, v6.8h, v5.8h\n"
+ "ldr h17, [x11, x4]\n"
+ "ldr h16, [x10, x4]\n"
+ "fadd v18.8h, v17.8h, v16.8h\n"
"subs x3, x3, #0x1\n"
- "ldr h4, [x27, x4]\n"
- "ldr h3, [x26, x4]\n"
- "fadd v16.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v17.8h, v16.8h\n"
- "ldr h2, [x15, x4]\n"
- "ldr h1, [x14, x4]\n"
- "fadd v18.8h, v2.8h, v1.8h\n"
- "fadd v21.8h, v18.8h, v19.8h\n"
- "ldr h0, [x12, x4]\n"
- "ldr h31, [x28, x4]\n"
- "fadd v17.8h, v0.8h, v31.8h\n"
- "ldr h30, [x9, x4]\n"
- "ldr h29, [x25, x4]\n"
- "fadd v22.8h, v30.8h, v29.8h\n"
- "ldr h28, [x23, x4]\n"
- "ldr h27, [x22, x4]\n"
- "fadd v16.8h, v28.8h, v27.8h\n"
- "fadd v20.8h, v16.8h, v19.8h\n"
- "ldr h26, [x16, x4]\n"
- "ldr h25, [x13, x4]\n"
- "fadd v19.8h, v26.8h, v17.8h\n"
- "fadd v18.8h, v25.8h, v22.8h\n"
- "ldr h24, [x24, x4]\n"
- "ldr h23, [x21, x4]\n"
- "fadd v17.8h, v24.8h, v17.8h\n"
- "fadd v16.8h, v23.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
+ "ldr h17, [x27, x4]\n"
+ "ldr h16, [x26, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v18.8h, v16.8h\n"
+ "ldr h17, [x15, x4]\n"
+ "ldr h16, [x14, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v23.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x12, x4]\n"
+ "ldr h16, [x28, x4]\n"
+ "fadd v22.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x9, x4]\n"
+ "ldr h16, [x25, x4]\n"
+ "fadd v21.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23, x4]\n"
+ "ldr h16, [x22, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v20.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x16, x4]\n"
+ "ldr h16, [x13, x4]\n"
+ "fadd v19.8h, v17.8h, v22.8h\n"
+ "fadd v18.8h, v16.8h, v21.8h\n"
+ "ldr h17, [x24, x4]\n"
+ "ldr h16, [x21, x4]\n"
+ "fadd v17.8h, v17.8h, v22.8h\n"
+ "fadd v16.8h, v16.8h, v21.8h\n"
+ "fadd v19.8h, v23.8h, v19.8h\n"
+ "fadd v18.8h, v23.8h, v18.8h\n"
"add x4, x4, #0x2\n"
"fadd v17.8h, v17.8h, v20.8h\n"
"fadd v16.8h, v16.8h, v20.8h\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 44adb4ffcf..f7be92e53f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
-
#include <cstdint>
#include <cstddef>
@@ -45,77 +44,77 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v9.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd v23.8h, v4.8h, v3.8h\n"
"fadd v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fadd v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fadd v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fadd v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fadd v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fadd v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fadd v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fadd v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fadd v7.8h, v7.8h, v18.8h\n"
"fadd v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fadd v5.8h, v5.8h, v16.8h\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd v23.8h, v4.8h, v3.8h\n"
@@ -138,16 +137,16 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v4.8h\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fadd v7.8h, v7.8h, v2.8h\n"
- "fadd v6.8h, v6.8h, v0.8h\n"
- "ldr q30, [x24, x26]\n"
- "fadd v5.8h, v5.8h, v30.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
@@ -156,14 +155,14 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"fmul v7.8h, v7.8h, v9.8h\n"
"fmul v6.8h, v6.8h, v9.8h\n"
"fmul v5.8h, v5.8h, v9.8h\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -172,146 +171,146 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v4.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
"fmul v8.8h, v8.8h, v9.8h\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
"subs x25, x25, #0x1\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
"fadd v8.8h, v8.8h, v4.8h\n"
@@ -342,7 +341,7 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 23a9164b76..b65ac7e9fa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -48,4 +48,4 @@ struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 8041453cb1..4b073b9076 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
+
#include <cstddef>
#include <cstdint>
@@ -111,7 +112,7 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"fmax v18.8h, v18.8h, v21.8h\n"
"fmax v17.8h, v17.8h, v20.8h\n"
"add x15, x15, #0x10\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v16.8h\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
+ "fmax v16.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
"fmax v17.8h, v27.8h, v23.8h\n"
- "fmax v16.8h, v24.8h, v22.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
+ "fmax v19.8h, v24.8h, v22.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
"fmax v18.8h, v18.8h, v21.8h\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"fmax v17.8h, v17.8h, v20.8h\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v19.8h\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr h30, [x28, x15]\n"
- "ldr h29, [x25, x15]\n"
- "fmax v21.8h, v30.8h, v29.8h\n"
+ "ldr h16, [x28, x15]\n"
+ "ldr h17, [x25, x15]\n"
+ "fmax v23.8h, v16.8h, v17.8h\n"
"subs x16, x16, #0x1\n"
- "ldr h28, [x22, x15]\n"
- "ldr h27, [x26, x15]\n"
- "fmax v20.8h, v29.8h, v28.8h\n"
- "ldr h26, [x9, x15]\n"
- "ldr h25, [x27, x15]\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "ldr h24, [x24, x15]\n"
- "ldr h23, [x23, x15]\n"
- "fmax v18.8h, v25.8h, v24.8h\n"
- "fmax v17.8h, v27.8h, v23.8h\n"
- "ldr h22, [x21, x15]\n"
- "fmax v16.8h, v24.8h, v22.8h\n"
+ "ldr h16, [x22, x15]\n"
+ "ldr h22, [x26, x15]\n"
+ "fmax v21.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x9, x15]\n"
+ "ldr h17, [x27, x15]\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v20.8h, v23.8h, v16.8h\n"
+ "ldr h19, [x24, x15]\n"
+ "ldr h16, [x23, x15]\n"
+ "fmax v18.8h, v17.8h, v19.8h\n"
+ "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h16, [x21, x15]\n"
+ "fmax v16.8h, v19.8h, v16.8h\n"
"add x15, x15, #0x2\n"
- "fmax v18.8h, v18.8h, v21.8h\n"
- "fmax v17.8h, v17.8h, v20.8h\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
- "str h19, [x14, x12]\n"
+ "fmax v18.8h, v18.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v21.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "str h20, [x14, x12]\n"
"str h18, [x13, x12]\n"
"str h17, [x11, x12]\n"
"str h16, [x10, x12]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index e4de9fb79c..c92e2cdebd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -41,10 +41,10 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x20\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
@@ -53,66 +53,66 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"dup v7.8h, w20\n"
"dup v6.8h, w20\n"
"dup v5.8h, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fmax v23.8h, v4.8h, v3.8h\n"
"fmax v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fmax v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fmax v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fmax v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fmax v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fmax v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fmax v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fmax v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fmax v7.8h, v7.8h, v18.8h\n"
"fmax v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fmax v5.8h, v5.8h, v16.8h\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fmax v23.8h, v4.8h, v3.8h\n"
@@ -135,28 +135,28 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v4.8h\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fmax v7.8h, v7.8h, v2.8h\n"
- "fmax v6.8h, v6.8h, v0.8h\n"
- "ldr q30, [x24, x26]\n"
- "fmax v5.8h, v5.8h, v30.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
"cmp %x[n_channels], #0x20\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -166,146 +166,146 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.8h, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v4.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.8h, w20\n"
- "add %x[outptr], %x[outptr], x9\n"
- "mov x20, %x[inptrs]\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
"subs x25, x25, #0x1\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
"fmax v8.8h, v8.8h, v4.8h\n"
@@ -335,7 +335,7 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 813e685606..7add5feb1d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 9db65d62b0..cf0047638e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -22,12 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <algorithm>
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -196,38 +196,38 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr s6, [x11, x4]\n"
- "ldr s5, [x10, x4]\n"
- "fadd v17.4s, v6.4s, v5.4s\n"
+ "ldr s17, [x11, x4]\n"
+ "ldr s16, [x10, x4]\n"
+ "fadd v18.4s, v17.4s, v16.4s\n"
"subs x3, x3, #0x1\n"
- "ldr s4, [x27, x4]\n"
- "ldr s3, [x26, x4]\n"
- "fadd v16.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v17.4s, v16.4s\n"
- "ldr s2, [x15, x4]\n"
- "ldr s1, [x14, x4]\n"
- "fadd v18.4s, v2.4s, v1.4s\n"
- "fadd v21.4s, v18.4s, v19.4s\n"
- "ldr s0, [x12, x4]\n"
- "ldr s31, [x28, x4]\n"
- "fadd v17.4s, v0.4s, v31.4s\n"
- "ldr s30, [x9, x4]\n"
- "ldr s29, [x25, x4]\n"
- "fadd v22.4s, v30.4s, v29.4s\n"
- "ldr s28, [x23, x4]\n"
- "ldr s27, [x22, x4]\n"
- "fadd v16.4s, v28.4s, v27.4s\n"
- "fadd v20.4s, v16.4s, v19.4s\n"
- "ldr s26, [x16, x4]\n"
- "ldr s25, [x13, x4]\n"
- "fadd v19.4s, v26.4s, v17.4s\n"
- "fadd v18.4s, v25.4s, v22.4s\n"
- "ldr s24, [x24, x4]\n"
- "ldr s23, [x21, x4]\n"
- "fadd v17.4s, v24.4s, v17.4s\n"
- "fadd v16.4s, v23.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
+ "ldr s17, [x27, x4]\n"
+ "ldr s16, [x26, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "ldr s17, [x15, x4]\n"
+ "ldr s16, [x14, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v23.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x12, x4]\n"
+ "ldr s16, [x28, x4]\n"
+ "fadd v22.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x9, x4]\n"
+ "ldr s16, [x25, x4]\n"
+ "fadd v21.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x23, x4]\n"
+ "ldr s16, [x22, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v20.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x16, x4]\n"
+ "ldr s16, [x13, x4]\n"
+ "fadd v19.4s, v17.4s, v22.4s\n"
+ "fadd v18.4s, v16.4s, v21.4s\n"
+ "ldr s17, [x24, x4]\n"
+ "ldr s16, [x21, x4]\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v23.4s, v19.4s\n"
+ "fadd v18.4s, v23.4s, v18.4s\n"
"add x4, x4, #0x4\n"
"fadd v17.4s, v17.4s, v20.4s\n"
"fadd v16.4s, v16.4s, v20.4s\n"
@@ -250,4 +250,5 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 3f90610591..d236f07b1c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -44,77 +44,77 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v9.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd v23.4s, v4.4s, v3.4s\n"
"fadd v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fadd v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fadd v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fadd v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fadd v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fadd v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fadd v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fadd v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fadd v7.4s, v7.4s, v18.4s\n"
"fadd v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fadd v5.4s, v5.4s, v16.4s\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd v23.4s, v4.4s, v3.4s\n"
@@ -137,16 +137,16 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v4.4s\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fadd v7.4s, v7.4s, v2.4s\n"
- "fadd v6.4s, v6.4s, v0.4s\n"
- "ldr q30, [x24, x26]\n"
- "fadd v5.4s, v5.4s, v30.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
@@ -155,14 +155,14 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"fmul v7.4s, v7.4s, v9.4s\n"
"fmul v6.4s, v6.4s, v9.4s\n"
"fmul v5.4s, v5.4s, v9.4s\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -171,110 +171,110 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v4.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
"fmul v8.4s, v8.4s, v9.4s\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
"subs x25, x25, #0x1\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
"fadd v8.4s, v8.4s, v4.4s\n"
@@ -293,10 +293,11 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 4bf5770857..2f72b59d70 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 2e7fb3c5b1..f4202de1ed 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"fmax v18.4s, v18.4s, v21.4s\n"
"fmax v17.4s, v17.4s, v20.4s\n"
"add x15, x15, #0x10\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v16.4s\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
+ "fmax v16.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
"fmax v17.4s, v27.4s, v23.4s\n"
- "fmax v16.4s, v24.4s, v22.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
+ "fmax v19.4s, v24.4s, v22.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"fmax v17.4s, v17.4s, v20.4s\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v19.4s\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr s30, [x28, x15]\n"
- "ldr s29, [x25, x15]\n"
- "fmax v21.4s, v30.4s, v29.4s\n"
+ "ldr s16, [x28, x15]\n"
+ "ldr s17, [x25, x15]\n"
+ "fmax v23.4s, v16.4s, v17.4s\n"
"subs x16, x16, #0x1\n"
- "ldr s28, [x22, x15]\n"
- "ldr s27, [x26, x15]\n"
- "fmax v20.4s, v29.4s, v28.4s\n"
- "ldr s26, [x9, x15]\n"
- "ldr s25, [x27, x15]\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "ldr s24, [x24, x15]\n"
- "ldr s23, [x23, x15]\n"
- "fmax v18.4s, v25.4s, v24.4s\n"
- "fmax v17.4s, v27.4s, v23.4s\n"
- "ldr s22, [x21, x15]\n"
- "fmax v16.4s, v24.4s, v22.4s\n"
+ "ldr s16, [x22, x15]\n"
+ "ldr s22, [x26, x15]\n"
+ "fmax v21.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x9, x15]\n"
+ "ldr s17, [x27, x15]\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v20.4s, v23.4s, v16.4s\n"
+ "ldr s19, [x24, x15]\n"
+ "ldr s16, [x23, x15]\n"
+ "fmax v18.4s, v17.4s, v19.4s\n"
+ "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s16, [x21, x15]\n"
+ "fmax v16.4s, v19.4s, v16.4s\n"
"add x15, x15, #0x4\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v20.4s\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
- "str s19, [x14, x12]\n"
+ "fmax v18.4s, v18.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "str s20, [x14, x12]\n"
"str s18, [x13, x12]\n"
"str s17, [x11, x12]\n"
"str s16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 4f1af09e08..f4706635dc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,10 +41,10 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x10\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
@@ -53,66 +53,66 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"dup v7.4s, w20\n"
"dup v6.4s, w20\n"
"dup v5.4s, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fmax v23.4s, v4.4s, v3.4s\n"
"fmax v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fmax v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fmax v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fmax v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fmax v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fmax v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fmax v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fmax v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fmax v7.4s, v7.4s, v18.4s\n"
"fmax v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fmax v5.4s, v5.4s, v16.4s\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fmax v23.4s, v4.4s, v3.4s\n"
@@ -135,28 +135,28 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v4.4s\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fmax v7.4s, v7.4s, v2.4s\n"
- "fmax v6.4s, v6.4s, v0.4s\n"
- "ldr q30, [x24, x26]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -166,110 +166,110 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.4s, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v4.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.4s, w20\n"
- "add %x[outptr], %x[outptr], x9\n"
- "mov x20, %x[inptrs]\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
"subs x25, x25, #0x1\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
"fmax v8.4s, v8.4s, v4.4s\n"
@@ -287,10 +287,11 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 5a7e5f981b..5d082102b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -105,7 +105,7 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -121,42 +121,42 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "saddl v17.8h, v25.8b, v24.8b\n"
+ "saddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddl v17.8h, v25.8b, v24.8b\n"
- "saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -196,23 +196,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -330,49 +330,49 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -397,9 +397,9 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -411,142 +411,142 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -569,9 +569,9 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -626,4 +626,5 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 234b4442c8..f8f1134866 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index bd14408c74..7e62ac1afc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"smax v18.16b, v18.16b, v21.16b\n"
"smax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
- "smax v16.16b, v16.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v16.16b\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v19.16b, v27.16b, v26.16b\n"
+ "smax v16.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
"smax v17.16b, v27.16b, v23.16b\n"
- "smax v16.16b, v24.16b, v22.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
+ "smax v19.16b, v24.16b, v22.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
"smax v18.16b, v18.16b, v21.16b\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"smax v17.16b, v17.16b, v20.16b\n"
- "smax v16.16b, v16.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v19.16b\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x28, x15]\n"
- "ldr b29, [x25, x15]\n"
- "smax v21.16b, v30.16b, v29.16b\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "smax v23.16b, v16.16b, v17.16b\n"
"subs x16, x16, #0x1\n"
- "ldr b28, [x22, x15]\n"
- "ldr b27, [x26, x15]\n"
- "smax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x9, x15]\n"
- "ldr b25, [x27, x15]\n"
- "smax v19.16b, v27.16b, v26.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "ldr b24, [x24, x15]\n"
- "ldr b23, [x23, x15]\n"
- "smax v18.16b, v25.16b, v24.16b\n"
- "smax v17.16b, v27.16b, v23.16b\n"
- "ldr b22, [x21, x15]\n"
- "smax v16.16b, v24.16b, v22.16b\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "smax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "smax v18.16b, v17.16b, v19.16b\n"
+ "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "smax v16.16b, v19.16b, v16.16b\n"
"add x15, x15, #0x1\n"
- "smax v18.16b, v18.16b, v21.16b\n"
- "smax v17.16b, v17.16b, v20.16b\n"
- "smax v16.16b, v16.16b, v20.16b\n"
- "str b19, [x14, x12]\n"
+ "smax v18.16b, v18.16b, v23.16b\n"
+ "smax v17.16b, v17.16b, v21.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
"str b18, [x13, x12]\n"
"str b17, [x11, x12]\n"
"str b16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 6168a57ca4..411fd11460 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,77 +41,77 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"smax v23.16b, v4.16b, v3.16b\n"
"smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"smax v23.16b, v4.16b, v3.16b\n"
@@ -134,28 +134,28 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "smax v7.16b, v7.16b, v2.16b\n"
- "smax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "smax v5.16b, v5.16b, v30.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -164,217 +164,217 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"smax v8.16b, v8.16b, v4.16b\n"
@@ -428,10 +428,11 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index e889782fa3..019f402911 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,8 +22,6 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
@@ -31,6 +29,8 @@
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -124,7 +124,7 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -140,42 +140,42 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "saddl v17.8h, v25.8b, v24.8b\n"
+ "saddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddl v17.8h, v25.8b, v24.8b\n"
- "saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -215,23 +215,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -366,49 +366,49 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v18.4s }, [%x[left_shift]]\n"
@@ -438,9 +438,9 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -452,142 +452,142 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v18.4s }, [%x[left_shift]]\n"
@@ -615,9 +615,9 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -672,4 +672,5 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 90a31ec677..f7b8dc761c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -21,12 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -42,77 +43,77 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"smax v23.16b, v4.16b, v3.16b\n"
"smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"smax v23.16b, v4.16b, v3.16b\n"
@@ -135,16 +136,16 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "smax v7.16b, v7.16b, v2.16b\n"
- "smax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "smax v5.16b, v5.16b, v30.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sxtl v23.8h, v8.8b\n"
@@ -271,16 +272,16 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"uzp1 v19.16b, v24.16b, v19.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
"uzp1 v16.16b, v20.16b, v19.16b\n"
- "str q18, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q17, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q16, [%x[outptr], x26]\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -289,296 +290,296 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "sxtl2 v22.8h, v8.16b\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v4.4s }, [x20]\n"
- "sxtl v1.4s, v23.4h\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v3.4s }, [x20]\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v4.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v0.4s, v0.4s, v4.4s\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v1.4s, v1.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "sxtl2 v22.8h, v8.16b\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v4.4s }, [x20]\n"
- "sxtl v1.4s, v23.4h\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v3.4s }, [x20]\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v4.4s\n"
- "srshl v0.4s, v0.4s, v4.4s\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v1.4s, v1.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -627,10 +628,11 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 76828a911e..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -105,7 +105,7 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -121,42 +121,42 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "uaddl v17.8h, v25.8b, v24.8b\n"
+ "uaddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddl v17.8h, v25.8b, v24.8b\n"
- "uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -196,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -330,49 +330,49 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -397,9 +397,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -411,142 +411,142 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -569,9 +569,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -626,4 +626,5 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 556d833681..9d160bf8f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<u
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 149566197a..66cdb7f849 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"umax v18.16b, v18.16b, v21.16b\n"
"umax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
- "umax v16.16b, v16.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v16.16b\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v19.16b, v27.16b, v26.16b\n"
+ "umax v16.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
"umax v17.16b, v27.16b, v23.16b\n"
- "umax v16.16b, v24.16b, v22.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
+ "umax v19.16b, v24.16b, v22.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
"umax v18.16b, v18.16b, v21.16b\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"umax v17.16b, v17.16b, v20.16b\n"
- "umax v16.16b, v16.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v19.16b\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x28, x15]\n"
- "ldr b29, [x25, x15]\n"
- "umax v21.16b, v30.16b, v29.16b\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "umax v23.16b, v16.16b, v17.16b\n"
"subs x16, x16, #0x1\n"
- "ldr b28, [x22, x15]\n"
- "ldr b27, [x26, x15]\n"
- "umax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x9, x15]\n"
- "ldr b25, [x27, x15]\n"
- "umax v19.16b, v27.16b, v26.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "ldr b24, [x24, x15]\n"
- "ldr b23, [x23, x15]\n"
- "umax v18.16b, v25.16b, v24.16b\n"
- "umax v17.16b, v27.16b, v23.16b\n"
- "ldr b22, [x21, x15]\n"
- "umax v16.16b, v24.16b, v22.16b\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "umax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "umax v18.16b, v17.16b, v19.16b\n"
+ "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "umax v16.16b, v19.16b, v16.16b\n"
"add x15, x15, #0x1\n"
- "umax v18.16b, v18.16b, v21.16b\n"
- "umax v17.16b, v17.16b, v20.16b\n"
- "umax v16.16b, v16.16b, v20.16b\n"
- "str b19, [x14, x12]\n"
+ "umax v18.16b, v18.16b, v23.16b\n"
+ "umax v17.16b, v17.16b, v21.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
"str b18, [x13, x12]\n"
"str b17, [x11, x12]\n"
"str b16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index 98f5b8351c..2ceef125ca 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,77 +41,77 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"umax v23.16b, v4.16b, v3.16b\n"
"umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"umax v7.16b, v7.16b, v18.16b\n"
"umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"umax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"umax v23.16b, v4.16b, v3.16b\n"
@@ -134,28 +134,28 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "umax v7.16b, v7.16b, v2.16b\n"
- "umax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "umax v5.16b, v5.16b, v30.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -164,217 +164,217 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"umax v8.16b, v8.16b, v4.16b\n"
@@ -428,10 +428,11 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 19227d8aaa..31a3489e5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,8 +22,6 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
@@ -31,6 +29,8 @@
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -132,7 +132,7 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -145,42 +145,42 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v1.16b, v15.16b\n"
"mov v0.16b, v15.16b\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "uaddl v17.8h, v25.8b, v24.8b\n"
+ "uaddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddl v17.8h, v25.8b, v24.8b\n"
- "uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -220,23 +220,23 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -391,56 +391,56 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
"ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v19.4s\n"
- "srshl v12.4s, v12.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
"sqrdmulh v15.4s, v15.4s, v18.4s\n"
@@ -467,9 +467,9 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -481,151 +481,151 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
"ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v19.4s\n"
- "srshl v12.4s, v12.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
"sqrdmulh v15.4s, v15.4s, v18.4s\n"
@@ -650,9 +650,9 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -707,4 +707,5 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 7eea14f70f..f4927c5536 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,12 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -43,77 +43,77 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"umax v23.16b, v4.16b, v3.16b\n"
"umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"umax v7.16b, v7.16b, v18.16b\n"
"umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"umax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"umax v23.16b, v4.16b, v3.16b\n"
@@ -136,16 +136,16 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "umax v7.16b, v7.16b, v2.16b\n"
- "umax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "umax v5.16b, v5.16b, v30.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
@@ -292,17 +292,17 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"uzp1 v19.16b, v25.16b, v19.16b\n"
"uzp1 v18.16b, v24.16b, v18.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"uzp1 v16.16b, v22.16b, v21.16b\n"
"uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
- "str q17, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"str q16, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -311,314 +311,314 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v4.4s }, [x20]\n"
- "uxtl v23.8h, v8.8b\n"
- "uxtl2 v24.8h, v8.16b\n"
- "neg v4.4s, v4.4s\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
- "saddw v0.4s, v4.4s, v23.4h\n"
- "saddw2 v23.4s, v4.4s, v23.8h\n"
- "saddw v31.4s, v4.4s, v24.4h\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v2.4s }, [x20]\n"
- "saddw2 v30.4s, v4.4s, v24.8h\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v2.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v30.4s, v30.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v1.4s\n"
- "srshl v23.4s, v23.4s, v1.4s\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "srshl v30.4s, v30.4s, v1.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "add v23.4s, v23.4s, v16.4s\n"
- "add v31.4s, v31.4s, v16.4s\n"
- "add v30.4s, v30.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0x0\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0xff\n"
- "smin v0.4s, v0.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v16.4s\n"
- "smin v31.4s, v31.4s, v16.4s\n"
- "smin v30.4s, v30.4s, v16.4s\n"
- "uzp1 v23.16b, v0.16b, v23.16b\n"
- "uzp1 v16.16b, v31.16b, v30.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v4.4s }, [x20]\n"
- "uxtl v23.8h, v8.8b\n"
- "uxtl2 v24.8h, v8.16b\n"
- "neg v4.4s, v4.4s\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
- "saddw v0.4s, v4.4s, v23.4h\n"
- "saddw2 v23.4s, v4.4s, v23.8h\n"
- "saddw v31.4s, v4.4s, v24.4h\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v2.4s }, [x20]\n"
- "saddw2 v30.4s, v4.4s, v24.8h\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v2.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v30.4s, v30.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v1.4s\n"
- "srshl v23.4s, v23.4s, v1.4s\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "srshl v30.4s, v30.4s, v1.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "add v23.4s, v23.4s, v16.4s\n"
- "add v31.4s, v31.4s, v16.4s\n"
- "add v30.4s, v30.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0x0\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0xff\n"
- "smin v0.4s, v0.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v16.4s\n"
- "smin v31.4s, v31.4s, v16.4s\n"
- "smin v30.4s, v30.4s, v16.4s\n"
- "uzp1 v23.16b, v0.16b, v23.16b\n"
- "uzp1 v16.16b, v31.16b, v30.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -667,10 +667,11 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
index 2bb22131f7..1f8f863de2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,10 @@
#include <cstdint>
#include <cstring>
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#include "bfloat.hpp"
+using arm_gemm::bfloat16;
+#endif
namespace arm_conv {
namespace pooling {
@@ -41,9 +45,15 @@ void cpp_nhwc_1x1_stride_any_depthfirst_impl(
}
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const float *const *, float *);
-#if defined(__ARM_FP16_ARGS)
+
+#ifdef __ARM_FP16_ARGS
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const __fp16 *const *, __fp16 *);
-#endif // defined(__ARM_FP16_ARGS)
+#endif
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const bfloat16 *const *, bfloat16 *);
+#endif
+
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const int8_t *const *, int8_t *);
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const uint8_t *const *, uint8_t *);
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 250d92c051..f6682e75e2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index bce623acd1..67b07205cd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -91,34 +91,34 @@ void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x20, %x[args], %[offsetof_rescale]\n"
"ld1rqh { z4.h }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.h, x3, x5\n"
+ "whilelt p0.h, x3, x5\n"
"mov x6, #0x0\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
- "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
"whilelt p1.h, x3, x5\n"
"b.none 2f\n"
@@ -206,4 +206,4 @@ void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
index 117eb36007..cf09f421c4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index c43da42d9e..60f17b7bc2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -57,68 +58,68 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.h, z1.h, z0.h\n"
"fadd z19.h, z31.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fadd z22.h, z29.h, z22.h\n"
"fadd z18.h, z28.h, z18.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z5.h, z5.h, z19.h\n"
"fadd z4.h, z4.h, z18.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
"fadd z3.h, z3.h, z17.h\n"
"fadd z2.h, z2.h, z16.h\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.h, z1.h, z0.h\n"
@@ -141,16 +142,16 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z1.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
- "fadd z4.h, z4.h, z29.h\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "fadd z3.h, z3.h, z27.h\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "fadd z2.h, z2.h, z25.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z2.h, z2.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z5.h, z5.h, z6.h\n"
@@ -173,44 +174,44 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z19.h, z31.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z19.h, z23.h, z19.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z5.h, z5.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z19.h, z31.h, z30.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z5.h, z5.h, z19.h\n"
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z1.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z5.h, z5.h, z6.h\n"
@@ -229,4 +230,4 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9489c1f8da..cd6c7449a8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f71f2625b6..7fc776ed4e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.h, x15, x13\n"
+ "whilelt p0.h, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x22, x15, LSL #1]\n"
- "ld1h { z19.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
"ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
"whilelt p0.h, x14, x13\n"
- "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
- "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
"ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
- "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z23.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
"ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
"ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z20.h\n"
- "fmax z18.h, p2/M, z18.h, z22.h\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
"ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
- "fmax z17.h, p2/M, z17.h, z21.h\n"
- "fmax z16.h, p2/M, z16.h, z21.h\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
"ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
"st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
"ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
"st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
"st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
- "ld1h { z19.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
@@ -125,13 +125,13 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.h, x14, x13\n"
"movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
"movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
- "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z23.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z19.h\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z20.h\n"
+ "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
+ "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
"fmax z18.h, p2/M, z18.h, z22.h\n"
- "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
"fmax z17.h, p2/M, z17.h, z21.h\n"
- "fmax z16.h, p2/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
"st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
"st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
@@ -145,4 +145,4 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
index 33ff1f2154..bfb3bf5b1a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
index c07ce97231..afa2ccbd71 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.h, #0xfc00\n"
"mov z3.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.h, #0xfc00\n"
"mov z1.h, #0xfc00\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z29.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
"fmax z23.h, p0/M, z23.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fmax z18.h, p0/M, z18.h, z29.h\n"
"fmax z22.h, p0/M, z22.h, z28.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fmax z17.h, p0/M, z17.h, z27.h\n"
"fmax z21.h, p0/M, z21.h, z26.h\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
"fmax z16.h, p0/M, z16.h, z25.h\n"
"fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
"fmax z19.h, p0/M, z19.h, z23.h\n"
"fmax z18.h, p0/M, z18.h, z22.h\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"fmax z4.h, p0/M, z4.h, z19.h\n"
"fmax z3.h, p0/M, z3.h, z18.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
"fmax z2.h, p0/M, z2.h, z17.h\n"
"fmax z1.h, p0/M, z1.h, z16.h\n"
- "ld1h { z29.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
@@ -138,15 +139,15 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z0.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
- "fmax z3.h, p0/M, z3.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmax z2.h, p0/M, z2.h, z17.h\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z3.h, p0/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
"fmax z1.h, p0/M, z1.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z0.h\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
@@ -221,4 +222,4 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index fa1b441371..23a0eee04e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index cf69800522..8c8532827a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -91,34 +91,34 @@ void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x20, %x[args], %[offsetof_rescale]\n"
"ld1rqw { z4.s }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.s, x3, x5\n"
+ "whilelt p0.s, x3, x5\n"
"mov x6, #0x0\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
- "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
"whilelt p1.s, x3, x5\n"
"b.none 2f\n"
@@ -206,4 +206,4 @@ void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
index 814c89ca23..29bcfc5a3b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 03ab9c0a9e..86e7f84542 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -57,68 +58,68 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.s, z1.s, z0.s\n"
"fadd z19.s, z31.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fadd z22.s, z29.s, z22.s\n"
"fadd z18.s, z28.s, z18.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z5.s, z5.s, z19.s\n"
"fadd z4.s, z4.s, z18.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
"fadd z3.s, z3.s, z17.s\n"
"fadd z2.s, z2.s, z16.s\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.s, z1.s, z0.s\n"
@@ -141,16 +142,16 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z1.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
- "fadd z4.s, z4.s, z29.s\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "fadd z3.s, z3.s, z27.s\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "fadd z2.s, z2.s, z25.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z2.s, z2.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z5.s, z5.s, z6.s\n"
@@ -173,44 +174,44 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z19.s, z31.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z19.s, z23.s, z19.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z5.s, z5.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z19.s, z31.s, z30.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z5.s, z5.s, z19.s\n"
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z1.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z5.s, z5.s, z6.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 4e3cd6e228..338348231f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 05edac6623..3c7213a498 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.s, x15, x13\n"
+ "whilelt p0.s, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x22, x15, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
"ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
"whilelt p0.s, x14, x13\n"
- "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
- "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
"ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
- "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z23.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
"ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
"ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z20.s\n"
- "fmax z18.s, p2/M, z18.s, z22.s\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
"ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
- "fmax z17.s, p2/M, z17.s, z21.s\n"
- "fmax z16.s, p2/M, z16.s, z21.s\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
"ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
"st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
"ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
"st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
"st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
@@ -125,13 +125,13 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.s, x14, x13\n"
"movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
"movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
- "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z23.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z19.s\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z20.s\n"
+ "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
+ "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
"fmax z18.s, p2/M, z18.s, z22.s\n"
- "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
"fmax z17.s, p2/M, z17.s, z21.s\n"
- "fmax z16.s, p2/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
"st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
"st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
@@ -145,4 +145,4 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
index 0c0e445c7a..9bc1f11601 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 14c07724a1..0dabc2f292 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.s, #0xff800000\n"
"mov z3.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.s, #0xff800000\n"
"mov z1.s, #0xff800000\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z29.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
"fmax z23.s, p0/M, z23.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fmax z18.s, p0/M, z18.s, z29.s\n"
"fmax z22.s, p0/M, z22.s, z28.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fmax z17.s, p0/M, z17.s, z27.s\n"
"fmax z21.s, p0/M, z21.s, z26.s\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
"fmax z16.s, p0/M, z16.s, z25.s\n"
"fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
"fmax z19.s, p0/M, z19.s, z23.s\n"
"fmax z18.s, p0/M, z18.s, z22.s\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"fmax z4.s, p0/M, z4.s, z19.s\n"
"fmax z3.s, p0/M, z3.s, z18.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
"fmax z2.s, p0/M, z2.s, z17.s\n"
"fmax z1.s, p0/M, z1.s, z16.s\n"
- "ld1w { z29.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
@@ -138,15 +139,15 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z0.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
- "fmax z3.s, p0/M, z3.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmax z2.s, p0/M, z2.s, z17.s\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
"fmax z1.s, p0/M, z1.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z0.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
index e383a4c3bd..318510e697 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
index ded1274c13..c24e977dc6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -109,7 +110,7 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,48 +126,48 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
@@ -203,20 +204,20 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -332,74 +333,74 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- "mov z19.s, #0x7f\n"
+ "mov z18.s, #0x7f\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z18.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 1613970618..c9a80e6a5b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index e3b9c98d80..96617566a8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.b, x15, x13\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z30.b }, p1/Z, [x27, x15]\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z20.b\n"
- "smax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
"ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "smax z17.b, p2/M, z17.b, z21.b\n"
- "smax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
"ld1b { z25.b }, p1/Z, [x26, x15]\n"
"st1b { z19.b }, p0, [x12, x14]\n"
"ld1b { z24.b }, p1/Z, [x23, x15]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -125,13 +125,13 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.b, x14, x13\n"
"movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
"movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z19.b\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z20.b\n"
+ "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
"smax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
"smax z17.b, p2/M, z17.b, z21.b\n"
- "smax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -145,4 +145,4 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
index 56aa120cfe..3e0d76c277 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
index 4e6cad6e92..d2b45cd353 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
"mov z3.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x80\n"
"mov z1.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
"smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"smax z4.b, p0/M, z4.b, z19.b\n"
"smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"smax z2.b, p0/M, z2.b, z17.b\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
@@ -138,15 +139,15 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z4.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
index ee02c60bc1..c6263f5dbc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index cc58d3e9e2..91f2f7ab31 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -128,7 +129,7 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -144,48 +145,48 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
@@ -222,20 +223,20 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -368,79 +369,79 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
"ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
"ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
- "mov z19.s, #0x7f\n"
+ "mov z18.s, #0x7f\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z18.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
@@ -456,4 +457,4 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
index 050aff397e..9667d37954 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 3850ebf464..e9b586f4ce 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -56,68 +57,68 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
"mov z3.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x80\n"
"mov z1.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
"smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"smax z4.b, p0/M, z4.b, z19.b\n"
"smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"smax z2.b, p0/M, z2.b, z17.b\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
@@ -140,15 +141,15 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -292,83 +293,83 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
- ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
+ ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
- ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
- ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- "mov z19.s, #0x7f\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
- "not z16.s, p0/M, z19.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
+ ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
+ ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
+ ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
index 2cdb2883c2..29a03ec509 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
index a637654908..f0e7bbf5cc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -109,7 +110,7 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,48 +126,48 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
@@ -203,20 +204,20 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -332,74 +333,74 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 6d5f53d7a5..3df4e4efb8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<u
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 9f267d76ea..9088cbde89 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.b, x15, x13\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z30.b }, p1/Z, [x27, x15]\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z20.b\n"
- "umax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
"ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "umax z17.b, p2/M, z17.b, z21.b\n"
- "umax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
"ld1b { z25.b }, p1/Z, [x26, x15]\n"
"st1b { z19.b }, p0, [x12, x14]\n"
"ld1b { z24.b }, p1/Z, [x23, x15]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -125,13 +125,13 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.b, x14, x13\n"
"movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
"movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z19.b\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z20.b\n"
+ "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
"umax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
"umax z17.b, p2/M, z17.b, z21.b\n"
- "umax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -145,4 +145,4 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
index 5c637ec3c3..077c8ed2f7 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
index 9a13deafda..06f13e8111 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
"mov z1.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
"umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"umax z4.b, p0/M, z4.b, z19.b\n"
"umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"umax z2.b, p0/M, z2.b, z17.b\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
@@ -138,15 +139,15 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z0.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z4.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
index 2930993800..bd30a32828 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index a2fe7a301d..52c52ccdb9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -136,7 +137,7 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -149,48 +150,48 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
@@ -227,20 +228,20 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -393,61 +394,61 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
- ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
- ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
- "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
@@ -457,19 +458,19 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
index d7bf6cbd08..69d627c047 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
index d050cd014f..c8e8e7d399 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -56,68 +57,68 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z3.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
"mov z1.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
"umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"umax z5.b, p0/M, z5.b, z19.b\n"
"umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"umax z2.b, p0/M, z2.b, z17.b\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
@@ -140,15 +141,15 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z5.b, p0/M, z5.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -313,92 +314,92 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z5.b, p0/M, z5.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z5.b, p0/M, z5.b, z19.b\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z5.b, p0/M, z5.b, z0.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
- ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
- ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
- ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
+ "ld1rw { z22.s }, p0/Z, [x20]\n"
+ ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
+ ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
"ld1rw { z19.s }, p0/Z, [x20]\n"
- ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
+ ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
- ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
- ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
- ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
- ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
- ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
- ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
- ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
- "add z1.s, z1.s, z16.s\n"
+ ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
+ ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
+ ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
"add z23.s, z23.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z23.s, p0/M, z23.s, z17.s\n"
+ "smax z21.s, p0/M, z21.s, z17.s\n"
+ "smax z20.s, p0/M, z20.s, z17.s\n"
+ "smax z18.s, p0/M, z18.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z16.s\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z23.h, z21.h\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z20.h, z18.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 593fb58445..1ba78f3fba 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -88,8 +88,8 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"ldp x5, x6, [x21, #0x0]\n"
- "whilelt p0.h, XZR, x20\n"
- "whilelt p1.h, x3, x2\n"
+ "whilelt p2.h, XZR, x20\n"
+ "whilelt p0.h, x3, x2\n"
"ldp x7, x8, [x21, #0x10]\n"
"ldp x17, x16, [x4, #0x0]\n"
"add x15, %x[args], %[offsetof_rescale]\n"
@@ -101,25 +101,25 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
- "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
- "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
- "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
"whilelt p1.h, x3, x2\n"
- "ld1rqh { z0.h }, p0/Z, [x15]\n"
+ "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
"fadd z17.h, z7.h, z6.h\n"
@@ -172,32 +172,32 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
"whilelt p0.h, x14, x2\n"
- "fadd z19.h, z17.h, z16.h\n"
+ "fadd z20.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
"fadd z17.h, z1.h, z31.h\n"
- "fadd z22.h, z30.h, z29.h\n"
+ "fadd z19.h, z30.h, z29.h\n"
"fadd z16.h, z28.h, z27.h\n"
- "fadd z21.h, z18.h, z19.h\n"
- "fadd z20.h, z16.h, z19.h\n"
- "fadd z19.h, z26.h, z17.h\n"
- "fadd z18.h, z25.h, z22.h\n"
+ "fadd z21.h, z18.h, z20.h\n"
+ "fadd z20.h, z16.h, z20.h\n"
+ "fadd z16.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z19.h\n"
"fadd z17.h, z24.h, z17.h\n"
- "fadd z16.h, z23.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
- "fmul z19.h, z19.h, z0.h[0]\n"
- "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z16.h, z21.h, z16.h\n"
+ "fmul z16.h, z16.h, z0.h[0]\n"
+ "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
"fadd z18.h, z21.h, z18.h\n"
"fadd z17.h, z17.h, z20.h\n"
"fmul z18.h, z18.h, z0.h[1]\n"
"fmul z17.h, z17.h, z0.h[2]\n"
- "fadd z16.h, z16.h, z20.h\n"
+ "fadd z16.h, z19.h, z20.h\n"
"fmul z16.h, z16.h, z0.h[3]\n"
"st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
"st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 594c65e18d..2bef44ea5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -57,68 +57,68 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.h, z2.h, z1.h\n"
"fadd z19.h, z0.h, z31.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z29.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
"fadd z6.h, z6.h, z19.h\n"
"fadd z5.h, z5.h, z18.h\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
"fadd z4.h, z4.h, z17.h\n"
"fadd z3.h, z3.h, z16.h\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.h, z2.h, z1.h\n"
@@ -141,16 +141,16 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.h, z6.h, z2.h\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "fadd z5.h, z5.h, z30.h\n"
- "fadd z4.h, z4.h, z27.h\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "fadd z3.h, z3.h, z25.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z5.h, z5.h, z17.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.h, z6.h, z7.h\n"
@@ -173,44 +173,44 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z2.h, z1.h\n"
- "fadd z19.h, z0.h, z31.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z19.h, z23.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.h, z17.h, z16.h\n"
"subs x25, x25, #0x1\n"
- "fadd z6.h, z6.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z2.h, z1.h\n"
- "fadd z19.h, z0.h, z31.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z6.h, z6.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.h, z6.h, z2.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z6.h, z6.h, z7.h\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 838cd3406c..31bbfd085e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p0.h, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1h { z31.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x21, x14, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z25.h }, p2/Z, [x23, x14, LSL #1]\n"
- "ld1h { z24.h }, p2/Z, [x22, x14, LSL #1]\n"
- "ld1h { z23.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x14, LSL #1]\n"
"incw x14\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.h, p1/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p1/M, z21.h, z29.h\n"
- "ld1h { z31.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z20, z28\n fmax z20.h, p1/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p1/M, z19.h, z25.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x14, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x28, x14, LSL #1]\n"
- "movprfx z17, z28\n fmax z17.h, p1/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p1/M, z18.h, z23.h\n"
- "ld1h { z28.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z25.h }, p2/Z, [x23, x14, LSL #1]\n"
- "ld1h { z24.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
+ "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
+ "ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
"whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p1/M, z16.h, z20.h\n"
- "ld1h { z23.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
"incw x14\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p1.h, x14, x15\n"
"st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p1/M, z16.h, z22.h\n"
- "fmax z17.h, p1/M, z17.h, z21.h\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
"st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z18\n fmax z16.h, p1/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
"st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.h, p1/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p1/M, z21.h, z29.h\n"
- "movprfx z20, z28\n fmax z20.h, p1/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p1/M, z19.h, z25.h\n"
- "movprfx z17, z28\n fmax z17.h, p1/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p1/M, z18.h, z23.h\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
+ "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
"whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p1/M, z16.h, z20.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
"st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p1/M, z16.h, z22.h\n"
- "fmax z17.h, p1/M, z17.h, z21.h\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
"st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z18\n fmax z16.h, p1/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
"st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 9f1f9e7377..1a01412836 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.h, #0xfc00\n"
"mov z7.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
"mov z5.h, #0xfc00\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
"movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
"fmax z22.h, p0/M, z22.h, z30.h\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
"movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
"fmax z21.h, p0/M, z21.h, z27.h\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
"fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
"fmax z19.h, p0/M, z19.h, z23.h\n"
"fmax z18.h, p0/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
"subs x25, x25, #0x1\n"
"fmax z8.h, p0/M, z8.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
"fmax z7.h, p0/M, z7.h, z18.h\n"
"fmax z6.h, p0/M, z6.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
"fmax z5.h, p0/M, z5.h, z16.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
@@ -138,16 +138,16 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z4.h\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmax z7.h, p0/M, z7.h, z0.h\n"
- "fmax z6.h, p0/M, z6.h, z29.h\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "fmax z5.h, p0/M, z5.h, z26.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z17.h\n"
+ "fmax z6.h, p0/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
@@ -166,44 +166,44 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
"subs x25, x25, #0x1\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z4.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 39197aa04d..c5ea5adea0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -88,8 +88,8 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"ldp x5, x6, [x21, #0x0]\n"
- "whilelt p0.s, XZR, x20\n"
- "whilelt p1.s, x3, x2\n"
+ "whilelt p2.s, XZR, x20\n"
+ "whilelt p0.s, x3, x2\n"
"ldp x7, x8, [x21, #0x10]\n"
"ldp x17, x16, [x4, #0x0]\n"
"add x15, %x[args], %[offsetof_rescale]\n"
@@ -101,25 +101,25 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
- "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
- "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
- "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
"whilelt p1.s, x3, x2\n"
- "ld1rqw { z0.s }, p0/Z, [x15]\n"
+ "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
"fadd z17.s, z7.s, z6.s\n"
@@ -172,32 +172,32 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
"whilelt p0.s, x14, x2\n"
- "fadd z19.s, z17.s, z16.s\n"
+ "fadd z20.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
"fadd z17.s, z1.s, z31.s\n"
- "fadd z22.s, z30.s, z29.s\n"
+ "fadd z19.s, z30.s, z29.s\n"
"fadd z16.s, z28.s, z27.s\n"
- "fadd z21.s, z18.s, z19.s\n"
- "fadd z20.s, z16.s, z19.s\n"
- "fadd z19.s, z26.s, z17.s\n"
- "fadd z18.s, z25.s, z22.s\n"
+ "fadd z21.s, z18.s, z20.s\n"
+ "fadd z20.s, z16.s, z20.s\n"
+ "fadd z16.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z19.s\n"
"fadd z17.s, z24.s, z17.s\n"
- "fadd z16.s, z23.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
- "fmul z19.s, z19.s, z0.s[0]\n"
- "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z16.s, z21.s, z16.s\n"
+ "fmul z16.s, z16.s, z0.s[0]\n"
+ "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
"fadd z18.s, z21.s, z18.s\n"
"fadd z17.s, z17.s, z20.s\n"
"fmul z18.s, z18.s, z0.s[1]\n"
"fmul z17.s, z17.s, z0.s[2]\n"
- "fadd z16.s, z16.s, z20.s\n"
+ "fadd z16.s, z19.s, z20.s\n"
"fmul z16.s, z16.s, z0.s[3]\n"
"st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
"st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index c1a3e5de84..7c94894892 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -57,68 +57,68 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.s, z2.s, z1.s\n"
"fadd z19.s, z0.s, z31.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z29.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
"fadd z6.s, z6.s, z19.s\n"
"fadd z5.s, z5.s, z18.s\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
"fadd z4.s, z4.s, z17.s\n"
"fadd z3.s, z3.s, z16.s\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.s, z2.s, z1.s\n"
@@ -141,16 +141,16 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.s, z6.s, z2.s\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "fadd z5.s, z5.s, z30.s\n"
- "fadd z4.s, z4.s, z27.s\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "fadd z3.s, z3.s, z25.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.s, z6.s, z7.s\n"
@@ -173,44 +173,44 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z2.s, z1.s\n"
- "fadd z19.s, z0.s, z31.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z19.s, z23.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.s, z17.s, z16.s\n"
"subs x25, x25, #0x1\n"
- "fadd z6.s, z6.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z2.s, z1.s\n"
- "fadd z19.s, z0.s, z31.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z6.s, z6.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.s, z6.s, z2.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z6.s, z6.s, z7.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index da0239cea8..d9cebd1363 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p0.s, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1w { z31.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x21, x14, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [x23, x14, LSL #2]\n"
- "ld1w { z24.s }, p2/Z, [x22, x14, LSL #2]\n"
- "ld1w { z23.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.s, p1/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p1/M, z21.s, z29.s\n"
- "ld1w { z31.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z20, z28\n fmax z20.s, p1/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p1/M, z19.s, z25.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x14, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x28, x14, LSL #2]\n"
- "movprfx z17, z28\n fmax z17.s, p1/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p1/M, z18.s, z23.s\n"
- "ld1w { z28.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [x23, x14, LSL #2]\n"
- "ld1w { z24.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
+ "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
+ "ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
"whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p1/M, z16.s, z20.s\n"
- "ld1w { z23.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p1.s, x14, x15\n"
"st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p1/M, z16.s, z22.s\n"
- "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
"st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z18\n fmax z16.s, p1/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
"st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.s, p1/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p1/M, z21.s, z29.s\n"
- "movprfx z20, z28\n fmax z20.s, p1/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p1/M, z19.s, z25.s\n"
- "movprfx z17, z28\n fmax z17.s, p1/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p1/M, z18.s, z23.s\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
+ "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
"whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p1/M, z16.s, z20.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
"st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p1/M, z16.s, z22.s\n"
- "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
"st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z18\n fmax z16.s, p1/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
"st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index ddce2be62c..87fc75adda 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.s, #0xff800000\n"
"mov z7.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
"mov z5.s, #0xff800000\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
"movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
"fmax z22.s, p0/M, z22.s, z30.s\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
"movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
"fmax z21.s, p0/M, z21.s, z27.s\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
"fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
"fmax z19.s, p0/M, z19.s, z23.s\n"
"fmax z18.s, p0/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
"subs x25, x25, #0x1\n"
"fmax z8.s, p0/M, z8.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
"fmax z7.s, p0/M, z7.s, z18.s\n"
"fmax z6.s, p0/M, z6.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
"fmax z5.s, p0/M, z5.s, z16.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
@@ -138,16 +138,16 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z4.s\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmax z7.s, p0/M, z7.s, z0.s\n"
- "fmax z6.s, p0/M, z6.s, z29.s\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "fmax z5.s, p0/M, z5.s, z26.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z17.s\n"
+ "fmax z6.s, p0/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
@@ -166,44 +166,44 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
"subs x25, x25, #0x1\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z4.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 68bd831d63..7925905e64 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -109,7 +109,7 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,42 +125,42 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -203,20 +203,20 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -332,49 +332,49 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 96e20c752e..5681cc1f3d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p0.b, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n smax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p1/M, z21.b, z29.b\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "movprfx z20, z28\n smax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p1/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "movprfx z17, z28\n smax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p1/M, z18.b, z23.b\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p1/M, z16.b, z20.b\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p1/M, z16.b, z22.b\n"
- "smax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n smax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n smax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p1/M, z21.b, z29.b\n"
- "movprfx z20, z28\n smax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p1/M, z19.b, z25.b\n"
- "movprfx z17, z28\n smax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p1/M, z18.b, z23.b\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p1/M, z16.b, z20.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p1/M, z16.b, z22.b\n"
- "smax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n smax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index 7d14edddeb..da9e1408f9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
"smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
"smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"smax z7.b, p0/M, z7.b, z18.b\n"
"smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"smax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
@@ -138,16 +138,16 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "smax z7.b, p0/M, z7.b, z0.b\n"
- "smax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "smax z5.b, p0/M, z5.b, z26.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
@@ -166,44 +166,44 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 7161c4f389..19a3b112ad 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -128,7 +128,7 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -144,42 +144,42 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -222,20 +222,20 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -368,57 +368,57 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
"ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 19209811d8..4fc1532d5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -56,68 +56,68 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
"smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
"smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"smax z7.b, p0/M, z7.b, z18.b\n"
"smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"smax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
@@ -140,16 +140,16 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "smax z7.b, p0/M, z7.b, z0.b\n"
- "smax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "smax z5.b, p0/M, z5.b, z26.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
@@ -292,82 +292,82 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
+ ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
- ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
+ ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
"mov z18.s, #0x7f\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
"not z16.s, p0/M, z18.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z17.s, p0/M, z17.s, z18.s\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "trn1 z17.h, z1.h, z17.h\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index f888038a2a..f3f4950a1f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -109,7 +109,7 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,42 +125,42 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -203,20 +203,20 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -332,49 +332,49 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
@@ -387,17 +387,17 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z18.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 70d308a585..8612555bfb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p0.b, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n umax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p1/M, z21.b, z29.b\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "movprfx z20, z28\n umax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p1/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "movprfx z17, z28\n umax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p1/M, z18.b, z23.b\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p1/M, z16.b, z20.b\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p1/M, z16.b, z22.b\n"
- "umax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n umax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n umax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p1/M, z21.b, z29.b\n"
- "movprfx z20, z28\n umax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p1/M, z19.b, z25.b\n"
- "movprfx z17, z28\n umax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p1/M, z18.b, z23.b\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p1/M, z16.b, z20.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p1/M, z16.b, z22.b\n"
- "umax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n umax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index 34aa5a3dd6..be0eb398ae 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
"umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
"umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"umax z7.b, p0/M, z7.b, z18.b\n"
"umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
@@ -138,16 +138,16 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "umax z7.b, p0/M, z7.b, z0.b\n"
- "umax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "umax z5.b, p0/M, z5.b, z26.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
@@ -166,44 +166,44 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 36ac381004..e8339a2cd9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -136,7 +136,7 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -149,42 +149,42 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -227,20 +227,20 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -393,55 +393,55 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
"ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
@@ -457,17 +457,17 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
+ "mov z17.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z16.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index a00cbc59d8..94522cdaaa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -56,68 +56,68 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
"umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
"umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"umax z7.b, p0/M, z7.b, z18.b\n"
"umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
@@ -140,103 +140,103 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "umax z7.b, p0/M, z7.b, z0.b\n"
- "umax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "umax z5.b, p0/M, z5.b, z26.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a918 // ushllb z24.h, z8.b, #0x0\n"
- ".inst 0x4508ad17 // ushllt z23.h, z8.b, #0x0\n"
- ".inst 0x4508a8f6 // ushllb z22.h, z7.b, #0x0\n"
- ".inst 0x4508acf5 // ushllt z21.h, z7.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a8d4 // ushllb z20.h, z6.b, #0x0\n"
- ".inst 0x4508acd3 // ushllt z19.h, z6.b, #0x0\n"
"ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
+ ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
+ ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
+ "neg z3.s, p0/M, z3.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
+ ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a8b2 // ushllb z18.h, z5.b, #0x0\n"
- ".inst 0x4508acb1 // ushllt z17.h, z5.b, #0x0\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x45984082 // saddwb z2.s, z4.s, z24.h\n"
- ".inst 0x45984481 // saddwt z1.s, z4.s, z24.h\n"
- ".inst 0x44828062 // srshl z2.s, p0/M, z2.s, z3.s\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- ".inst 0x45974080 // saddwb z0.s, z4.s, z23.h\n"
- ".inst 0x4597449f // saddwt z31.s, z4.s, z23.h\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
- ".inst 0x4596449d // saddwt z29.s, z4.s, z22.h\n"
- ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
- ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
- ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
- ".inst 0x4595449b // saddwt z27.s, z4.s, z21.h\n"
- ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
- ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
- ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
- ".inst 0x45944499 // saddwt z25.s, z4.s, z20.h\n"
- ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
- ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
- ".inst 0x45934098 // saddwb z24.s, z4.s, z19.h\n"
- ".inst 0x45934497 // saddwt z23.s, z4.s, z19.h\n"
- ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
- ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
- ".inst 0x45924096 // saddwb z22.s, z4.s, z18.h\n"
- ".inst 0x45924495 // saddwt z21.s, z4.s, z18.h\n"
- ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
- ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
- ".inst 0x45914094 // saddwb z20.s, z4.s, z17.h\n"
- ".inst 0x45914493 // saddwt z19.s, z4.s, z17.h\n"
- ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
- ".inst 0x44828073 // srshl z19.s, p0/M, z19.s, z3.s\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
+ ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
+ ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
+ ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
+ ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
+ ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
+ ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
+ ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
+ ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
- ".inst 0x04b077ff // sqrdmulh z31.s, z31.s, z16.s\n"
- ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
- ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
- ".inst 0x04b077de // sqrdmulh z30.s, z30.s, z16.s\n"
- ".inst 0x04b077bd // sqrdmulh z29.s, z29.s, z16.s\n"
- ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
- ".inst 0x4482823f // srshl z31.s, p0/M, z31.s, z17.s\n"
- ".inst 0x04b0779c // sqrdmulh z28.s, z28.s, z16.s\n"
- ".inst 0x04b0777b // sqrdmulh z27.s, z27.s, z16.s\n"
- ".inst 0x4482823e // srshl z30.s, p0/M, z30.s, z17.s\n"
- ".inst 0x4482823d // srshl z29.s, p0/M, z29.s, z17.s\n"
- ".inst 0x04b0775a // sqrdmulh z26.s, z26.s, z16.s\n"
- ".inst 0x04b07739 // sqrdmulh z25.s, z25.s, z16.s\n"
- ".inst 0x4482823c // srshl z28.s, p0/M, z28.s, z17.s\n"
- ".inst 0x4482823b // srshl z27.s, p0/M, z27.s, z17.s\n"
- ".inst 0x04b07718 // sqrdmulh z24.s, z24.s, z16.s\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x4482823a // srshl z26.s, p0/M, z26.s, z17.s\n"
- ".inst 0x44828239 // srshl z25.s, p0/M, z25.s, z17.s\n"
- ".inst 0x04b076d6 // sqrdmulh z22.s, z22.s, z16.s\n"
- ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
- ".inst 0x44828238 // srshl z24.s, p0/M, z24.s, z17.s\n"
- ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
- ".inst 0x44828236 // srshl z22.s, p0/M, z22.s, z17.s\n"
- ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
- ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
- ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
+ ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
+ ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
"add z31.s, z31.s, z16.s\n"
"add z30.s, z30.s, z16.s\n"
@@ -252,8 +252,8 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"add z20.s, z20.s, z16.s\n"
"add z19.s, z19.s, z16.s\n"
"mov z16.s, #0x0\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
"smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
"smax z0.s, p0/M, z0.s, z16.s\n"
"smax z31.s, p0/M, z31.s, z16.s\n"
"mov z18.s, #0xff\n"
@@ -269,9 +269,9 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"smax z21.s, p0/M, z21.s, z16.s\n"
"smax z20.s, p0/M, z20.s, z16.s\n"
"smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
"smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z2.h, z1.h\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
"smin z0.s, p0/M, z0.s, z18.s\n"
"smin z31.s, p0/M, z31.s, z18.s\n"
"trn1 z16.h, z0.h, z31.h\n"
@@ -313,91 +313,91 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a918 // ushllb z24.h, z8.b, #0x0\n"
- ".inst 0x4508ad17 // ushllt z23.h, z8.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x45984082 // saddwb z2.s, z4.s, z24.h\n"
- ".inst 0x45984481 // saddwt z1.s, z4.s, z24.h\n"
- ".inst 0x45974080 // saddwb z0.s, z4.s, z23.h\n"
- ".inst 0x4597449f // saddwt z31.s, z4.s, z23.h\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
+ ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
+ ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x44828062 // srshl z2.s, p0/M, z2.s, z3.s\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
- ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
"ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x04b077ff // sqrdmulh z31.s, z31.s, z16.s\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
- ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z2.s, z2.s, z16.s\n"
- ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
- ".inst 0x4482823f // srshl z31.s, p0/M, z31.s, z17.s\n"
- "add z1.s, z1.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
"mov z16.s, #0x0\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z2.h, z1.h\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "mov z16.s, #0xff\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z21.h, z20.h\n"
+ "smin z19.s, p0/M, z19.s, z16.s\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index 63333c8fb4..8a6e63d993 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -101,7 +101,7 @@ class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
{
auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
ws->input_buffer = ws + 1;
- ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * n_channels;
// Fill the input buffer with an appropriate value
TInput fill_val = 0;
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
deleted file mode 100644
index 4aabd957cd..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "pool_common.hpp"
-
-#include <stack>
-#include <vector>
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
-
- constexpr static unsigned int input_rows(void)
- {
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
- }
-
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
-
- size_t sizeof_input_buffer(void) const
- {
- return sizeof(TInput) * m_args.n_channels;
- }
-
- size_t sizeof_output_buffer(void) const
- {
- return sizeof(TOutput) * m_args.n_channels;
- }
-
- public:
- PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
- {
- }
-
- PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
- PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
-
- size_t get_working_size(void) const override
- {
- // We require an array of pointers for the inputs and outputs, a
- // channel-length vector in which to dump surplus output, and a
- // channel-length vector of padding values.
- return sizeof_input_buffer() + sizeof_output_buffer();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
-
- // Fill the input buffer
- const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
- ? static_cast<TInput>(0)
- : (std::numeric_limits<TInput>::has_infinity
- ? -std::numeric_limits<TInput>::infinity()
- : std::numeric_limits<TInput>::lowest());
- for (unsigned int i = 0; i < channels; i++)
- {
- input_buffer[i] = pad_value;
- }
-
- // Keep subdividing the output plane across the longest dimension until we
- // reach the size of the tile. Queue items for later processing. Note - we
- // can determine the largest size of the queue a priori from the input
- // tensor size, this would allow us to allocate memory within the working
- // space and improve performance.
- struct WorkItem
- {
- unsigned int output_i, output_j;
- unsigned int output_height, output_width;
-
- WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
- : output_i(i), output_j(j), output_height(height), output_width(width) {}
- };
-
- auto execute = [&] (const WorkItem &item) {
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Construct the output pointer array
- {
- const auto output_pad_right = strategy::out_rows() - item.output_width;
- auto outptr_element = outptr_array;
- auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
-
- // Fill the array with pointers to the output buffer
- for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
- {
- outptr_array[i] = output_buffer;
- }
-
- // Fill in the valid portion of the array
- for (unsigned int i = 0; i < item.output_height; i++)
- {
- auto outptr_col = outptr_row;
- for (unsigned int j = 0; j < item.output_width; j++)
- {
- *(outptr_element++) = outptr_col;
- outptr_col += ld_output_col;
- }
- outptr_element += output_pad_right;
- outptr_row += ld_output_row;
- }
- }
-
- const int start_i = item.output_i * strategy::stride_rows() - padding.top;
- const int end_i = start_i + input_rows();
- const unsigned int pad_top = std::max(0, 0 - start_i);
- const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
-
- const int start_j = item.output_j * strategy::stride_cols() - padding.left;
- const int end_j = start_j + input_cols();
- const unsigned int pad_left = std::max(0, 0 - start_j);
- const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
- {
- const unsigned int row_padding = pad_top + pad_bottom;
- const unsigned int valid_rows = input_rows() - row_padding;
-
- const unsigned int col_padding = pad_left + pad_right;
- const unsigned int valid_cols = input_cols() - col_padding;
-
- // Fill the array with pointers to the input buffer
- for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
- {
- inptr_array[i] = input_buffer;
- }
-
- // Compute valid initial pointer
- auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
-
- // Fill in the valid portion of the input array
- auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
- for (unsigned int i = 0; i < valid_rows; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = 0; j < valid_cols; j++)
- {
- *(inptr_element++) = inptr_col;
- inptr_col += ld_input_col;
- }
-
- inptr_row += ld_input_row;
- inptr_element += col_padding; // Skip the padding elements
- }
- }
-
- // Call the kernel
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(channels, inptr_array, outptr_array,
- pad_left, pad_top, pad_right, pad_bottom);
- };
-
- // Add the initial work item to the stack of work.
- std::stack<WorkItem, std::vector<WorkItem>> stack;
- stack.push(WorkItem(0, 0, output_height, output_width));
- while (!stack.empty())
- {
- // Pop an item from the stack, bisect the largest dimension and either
- // execute the resulting tiles or add them to the stack if they are too
- // large.
- const WorkItem item(stack.top());
- stack.pop();
-
- if (item.output_height <= strategy::out_rows() &&
- item.output_width <= strategy::out_cols())
- {
- execute(item);
- }
- else
- {
- // Split the largest dimension, such that we get an exact number of
- // tiles in the first partition.
- if (item.output_height >= item.output_width)
- {
- const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
- const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
-
- const unsigned int height_first = tiles_first * strategy::out_rows();
- const unsigned int height_second = item.output_height - height_first;
-
- stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
- stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
- }
- else
- {
- const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
- const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
-
- const unsigned int width_first = tiles_first * strategy::out_cols();
- const unsigned int width_second = item.output_width - width_first;
-
- stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
- stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index 65d9a91977..07c582059f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#pragma once
-#include "arm_compute/core/Error.h"
#include "depthfirst_driver.hpp"
#include "utils.hpp"
#if !defined(_WIN64) && !defined(__OpenBSD__)
@@ -208,10 +207,9 @@ class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
const unsigned int channel_start, const unsigned int channel_end,
const TensorSpec<const TInput *> &input,
const TensorSpec<TOutput *> &output,
- void *working_space
+ void *
) const override
{
- ARM_COMPUTE_UNUSED(working_space);
// Determine start position and padding
const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
deleted file mode 100644
index f3cb9a1d1f..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "pool_common.hpp"
-#include "utils.hpp"
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
- const Requantize32 m_requant; // Quantization parameters
-
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
-
- public:
- PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
- {
- }
-
- PoolingDepthfirstGenericQuantized(PoolingDepthfirstGenericQuantized &) = delete;
- PoolingDepthfirstGenericQuantized &operator=(PoolingDepthfirstGenericQuantized &) = delete;
-
- size_t sizeof_input_pointer_array(void) const
- {
- return sizeof(TInput *) * input_rows() * input_cols();
- }
-
- size_t get_working_size(unsigned int num_threads) const override
- {
- return num_threads * sizeof_input_pointer_array();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
- {
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
-
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
- }
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
-
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
- {
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- }
-
- // Compute the number of valid cells
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
- const auto valid_cols = input_cols() - pad_left - pad_right;
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
-#endif
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv